Skip to content

vllm.model_executor.layers.quantization.utils.humming_utils

Functions:

_convert_sublayer_to_humming(layer, sublayer_name, shape_n, shape_k, weight_schema, input_schema, num_experts, param_dtype)

Convert a sublayer's weights from checkpoint format to Humming format.

Returns:

  • tuple[Any, Any]

    Tuple of (converted_weight_schema, converted_input_schema)

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _convert_sublayer_to_humming(
    layer: "RoutedExperts",
    sublayer_name: str,
    shape_n: int,
    shape_k: int,
    weight_schema: Any,
    input_schema: Any,
    num_experts: int,
    param_dtype: torch.dtype,
) -> tuple[Any, Any]:
    """
    Bring one sublayer's checkpoint tensors into Humming format.

    If ``weight_schema`` is already a ``HummingWeightSchema`` the layer is left
    untouched and both schemas are returned as given. Otherwise the sublayer's
    tensors are read from ``layer``, passed through
    ``weight_schema.convert_humming`` and then ``input_schema.convert_humming``,
    and the tensors produced by the weight conversion replace the existing
    ``<sublayer_name>_*`` parameters on ``layer``.

    Returns:
        Tuple of (converted_weight_schema, converted_input_schema)
    """
    from humming.schema import HummingWeightSchema

    # Schemas that are already Humming-native need no conversion.
    if isinstance(weight_schema, HummingWeightSchema):
        return weight_schema, input_schema

    # "w13" is described as two equal stacks along N; any other sublayer is a
    # single stack. K is always a single stack.
    n_stacks = [shape_n // 2] * 2 if sublayer_name == "w13" else [shape_n]

    # Both conversions receive the very same stack lists and settings.
    shared_kwargs = {
        "shape_n_stacks": n_stacks,
        "shape_k_stacks": [shape_k],
        "param_dtype": param_dtype,
        "num_experts": num_experts,
    }

    checkpoint_tensors = _extract_sublayer_tensors(layer, sublayer_name)
    converted_weight_schema, converted_tensors = weight_schema.convert_humming(
        tensors=checkpoint_tensors,
        **shared_kwargs,
    )

    # Only the schema from the input conversion is kept; the tensors it
    # returns are discarded.
    converted_input_schema, _ = input_schema.convert_humming(
        tensors=converted_tensors,
        **shared_kwargs,
    )

    _replace_layer_parameters(layer, sublayer_name, converted_tensors)

    return converted_weight_schema, converted_input_schema

_extract_sublayer_tensors(layer, sublayer_name)

Extract tensors for a specific sublayer from the layer's state dict.

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _extract_sublayer_tensors(
    layer: "RoutedExperts",
    sublayer_name: str,
) -> dict[str, torch.Tensor]:
    """Collect the state-dict entries that belong to one sublayer.

    Every ``layer.state_dict()`` key that starts with ``"<sublayer_name>_"``
    is kept, and that prefix is stripped from the key in the returned dict.
    """
    prefix = f"{sublayer_name}_"
    return {
        full_name.removeprefix(prefix): tensor
        for full_name, tensor in layer.state_dict().items()
        if full_name.startswith(prefix)
    }

_group_shape(group_size, group_size_n=0)

Map humming group sizes to QuantKey GroupShape.

group_size: elements per group along K (col); 0 means full dimension. group_size_n: elements per group along N (row); 0 means 1 (per-row).

GroupShape convention: row = N dim, col = K dim.

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _group_shape(group_size: int, group_size_n: int = 0) -> GroupShape:
    """
    Translate humming group sizes into a QuantKey GroupShape.

    group_size:   elements per group along K (col); 0 means full dimension
                  (emitted as col=-1).
    group_size_n: elements per group along N (row); 0 means 1 (per-row).

    When both sizes are 0 the predefined ``GroupShape.PER_CHANNEL`` is
    returned instead of building a shape.

    GroupShape convention: row = N dim, col = K dim.
    """
    if (group_size, group_size_n) == (0, 0):
        return GroupShape.PER_CHANNEL

    # Start from the defaults and override with any positive group size.
    row, col = 1, -1
    if group_size_n > 0:
        row = group_size_n
    if group_size > 0:
        col = group_size
    return GroupShape(row=row, col=col)

_humming_input_schema_to_quant_key(schema)

Convert a HummingInputSchema to a QuantKey. Returns None if the schema represents unquantized (bf16/fp16) inputs.

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _humming_input_schema_to_quant_key(
    schema: "HummingInputSchema",
) -> QuantKey | None:
    """Build the QuantKey describing a HummingInputSchema's activations.

    Returns None when the schema has no activation dtype or one that is at
    least 16 bits wide, i.e. unquantized (bf16/fp16) inputs. Otherwise the key
    is symmetric with a dynamic (non-static) scale: a positive
    ``input_scale_group_size`` selects a ``(1, group_size)`` group shape with
    ``MXFP_SCALE_DTYPE`` scales, anything else selects
    ``GroupShape.PER_TOKEN`` with float32 scales.
    """
    act_dtype = schema.a_dtype
    if act_dtype is None or act_dtype.num_bits >= 16:
        return None

    quant_dtype = _HUMMING_TO_QUANT_DTYPE[act_dtype]

    scale_group = schema.input_scale_group_size
    if scale_group > 0:
        shape = GroupShape(row=1, col=scale_group)
        scale_dtype = MXFP_SCALE_DTYPE
    else:
        shape = GroupShape.PER_TOKEN
        scale_dtype = torch.float32

    return QuantKey(
        dtype=quant_dtype,
        scale=ScaleDesc(dtype=scale_dtype, static=False, group_shape=shape),
        symmetric=True,
    )

_prepare_and_transform_sublayer(layer, sublayer_name, shape_n, shape_k, weight_schema, input_schema, has_bias, num_experts, param_dtype)

Prepare layer metadata and transform weights for a sublayer.

This calls Humming's prepare_layer_meta and transform_humming_layer.

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _prepare_and_transform_sublayer(
    layer: "RoutedExperts",
    sublayer_name: str,
    shape_n: int,
    shape_k: int,
    weight_schema: Any,
    input_schema: Any,
    has_bias: bool,
    num_experts: int,
    param_dtype: torch.dtype,
) -> None:
    """
    Register Humming layer metadata for a sublayer, then transform its weights.

    Delegates to ``HummingMethod.prepare_layer_meta`` followed by
    ``HummingMethod.transform_humming_layer``; nothing is returned.
    """
    from humming.layer import HummingMethod

    # The padding multiples for N and K are fixed by this helper.
    meta_kwargs = {
        "layer": layer,
        "shape_n": shape_n,
        "shape_k": shape_k,
        "pad_n_to_multiple": 256,
        "pad_k_to_multiple": 128,
        "input_schema": input_schema,
        "weight_schema": weight_schema,
        "has_bias": has_bias,
        "num_experts": num_experts,
        "torch_dtype": param_dtype,
        "sublayer_name": sublayer_name,
    }
    HummingMethod.prepare_layer_meta(**meta_kwargs)

    HummingMethod.transform_humming_layer(layer, sublayer_name=sublayer_name)

_process_single_sublayer(layer, sublayer_name, shape_n, shape_k, weight_schema, input_schema, has_bias, num_experts, param_dtype, force_weight_schema=None)

Process a single sublayer: convert, optionally requant, prepare, and transform.

This combines the common logic from convert_to_humming_moe_kernel_format for processing a single sublayer.

Parameters:

  • layer

    (RoutedExperts) –

    The RoutedExperts layer

  • sublayer_name

    (str) –

    Name of the sublayer (e.g., "w13", "w2")

  • shape_n

    (int) –

    Output dimension size

  • shape_k

    (int) –

    Input dimension size

  • weight_schema

    (Any) –

    Initial weight quantization schema

  • input_schema

    (Any) –

    Initial input quantization schema

  • has_bias

    (bool) –

    Whether the layer has bias terms

  • num_experts

    (int) –

    Number of experts

  • param_dtype

    (dtype) –

    Parameter data type

  • force_weight_schema

    (Any | None, default: None ) –

    Optional schema to force requantization to

Returns:

  • tuple[Any, Any]

    Tuple of (final_weight_schema, final_input_schema)

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _process_single_sublayer(
    layer: "RoutedExperts",
    sublayer_name: str,
    shape_n: int,
    shape_k: int,
    weight_schema: Any,
    input_schema: Any,
    has_bias: bool,
    num_experts: int,
    param_dtype: torch.dtype,
    force_weight_schema: Any | None = None,
) -> tuple[Any, Any]:
    """
    Run the full per-sublayer pipeline: convert, optionally requantize,
    prepare metadata, and transform.

    This is the per-sublayer body used by convert_to_humming_moe_kernel_format.

    Args:
        layer: The RoutedExperts layer
        sublayer_name: Name of the sublayer (e.g., "w13", "w2")
        shape_n: Output dimension size
        shape_k: Input dimension size
        weight_schema: Initial weight quantization schema
        input_schema: Initial input quantization schema
        has_bias: Whether the layer has bias terms
        num_experts: Number of experts
        param_dtype: Parameter data type
        force_weight_schema: Optional schema to force requantization to;
            ignored when it equals the schema obtained after conversion

    Returns:
        Tuple of (final_weight_schema, final_input_schema)
    """
    from humming.schema import HummingWeightSchema

    # 1) Checkpoint layout -> humming layout (a no-op for humming schemas).
    active_weight_schema, active_input_schema = _convert_sublayer_to_humming(
        layer=layer,
        sublayer_name=sublayer_name,
        shape_n=shape_n,
        shape_k=shape_k,
        weight_schema=weight_schema,
        input_schema=input_schema,
        num_experts=num_experts,
        param_dtype=param_dtype,
    )

    # 2) Requantize only when a different target schema was requested.
    assert isinstance(active_weight_schema, HummingWeightSchema)
    must_requant = (
        force_weight_schema is not None
        and active_weight_schema != force_weight_schema
    )
    if must_requant:
        requantized = active_weight_schema.requant_tensors(
            tensors=_extract_sublayer_tensors(layer, sublayer_name),
            target_weight_schema=force_weight_schema,
            param_dtype=param_dtype,
        )

        active_weight_schema = force_weight_schema
        # The existing "<sublayer_name>_bias" parameter is not deleted here.
        _replace_layer_parameters(
            layer, sublayer_name, requantized, preserve_bias=True
        )
        # Drop the local reference once the layer owns the new parameters.
        del requantized

    # 3) Register metadata and transform the weights for the kernel.
    _prepare_and_transform_sublayer(
        layer=layer,
        sublayer_name=sublayer_name,
        shape_n=shape_n,
        shape_k=shape_k,
        weight_schema=active_weight_schema,
        input_schema=active_input_schema,
        has_bias=has_bias,
        num_experts=num_experts,
        param_dtype=param_dtype,
    )

    return active_weight_schema, active_input_schema

_replace_layer_parameters(layer, sublayer_name, tensors, preserve_bias=False)

Replace layer parameters for a sublayer with new tensors.

Parameters:

  • layer

    (RoutedExperts) –

    The RoutedExperts layer

  • sublayer_name

    (str) –

    Name of the sublayer (e.g., "w13", "w2")

  • tensors

    (dict[str, Tensor]) –

    Dict of parameter name to tensor

  • preserve_bias

    (bool, default: False ) –

    If True, don't delete bias parameters

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def _replace_layer_parameters(
    layer: "RoutedExperts",
    sublayer_name: str,
    tensors: dict[str, torch.Tensor],
    preserve_bias: bool = False,
) -> None:
    """
    Swap a sublayer's parameters on ``layer`` for a new set of tensors.

    All parameters whose name starts with ``"<sublayer_name>_"`` are removed
    first (optionally sparing ``"<sublayer_name>_bias"``), then every entry of
    ``tensors`` is attached as ``"<sublayer_name>_<key>"`` wrapped in a
    ``torch.nn.Parameter`` with ``requires_grad=False``.

    Args:
        layer: The RoutedExperts layer
        sublayer_name: Name of the sublayer (e.g., "w13", "w2")
        tensors: Dict of parameter name to tensor
        preserve_bias: If True, don't delete bias parameters
    """
    prefix = f"{sublayer_name}_"
    spared_name = f"{prefix}bias" if preserve_bias else None

    # Snapshot the names up front so deletion never mutates what is iterated.
    stale_names = [
        param_name
        for param_name, _ in layer.named_parameters()
        if param_name.startswith(prefix) and param_name != spared_name
    ]
    for param_name in stale_names:
        delattr(layer, param_name)

    # Attach the replacements as frozen parameters.
    for short_name, tensor in tensors.items():
        setattr(
            layer,
            f"{prefix}{short_name}",
            torch.nn.Parameter(tensor, requires_grad=False),
        )

convert_linear_layer_to_humming_standard(layer, name_map)

Rename/reshape a linear layer's quantized params (the canonical MPLinear layout: weight_packed int32 + weight_scale) into the parameter names and layout humming's weight schema expects (weight / weight_scale).

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def convert_linear_layer_to_humming_standard(
    layer: LinearBase, name_map: dict[str, str]
):
    """Re-attach a linear layer's quantized params under humming's names.

    ``name_map`` maps each target name to the attribute currently holding the
    tensor on ``layer`` (the canonical MPLinear layout: ``weight_packed`` int32
    + ``weight_scale``). Each source attribute is removed and its tensor is
    stored under the target name (``weight`` / ``weight_scale`` / ...):

    * ``weight`` is transposed when its ``input_dim``/``output_dim`` attributes
      say the output axis is 1, flattened to 2-D and reinterpreted as int32.
    * ``weight_scale`` and ``zero_point`` are transposed when ``output_dim`` is
      1 and given a trailing axis if 1-D; ``zero_point`` is additionally
      reinterpreted as int32.
    * Any other name is moved over unchanged.

    A value that is still a ``torch.nn.Parameter`` after these steps is
    attached as-is; otherwise it is wrapped with ``requires_grad=False``.
    """
    for target_name, source_name in name_map.items():
        value = getattr(layer, source_name)
        delattr(layer, source_name)

        if target_name == "weight":
            in_dim = getattr(value, "input_dim", 1)
            out_dim = getattr(value, "output_dim", 0)

            if (in_dim, out_dim) == (0, 1):
                # Stored input-major: flip so the output axis comes first.
                value = value.transpose(1, 0).contiguous()
            else:
                assert (in_dim, out_dim) == (1, 0)

            flattened = value.view(value.size(0), -1)
            value = flattened.view(torch.int32)
        elif target_name in ("weight_scale", "zero_point"):
            if getattr(value, "output_dim", 0) == 1:
                value = value.transpose(0, 1).contiguous()
            if value.ndim == 1:
                value = value.unsqueeze(1)
            if target_name == "zero_point":
                value = value.view(torch.int32)

        if not isinstance(value, torch.nn.Parameter):
            value = torch.nn.Parameter(value, requires_grad=False)

        setattr(layer, target_name, value)

convert_to_humming_moe_kernel_format(layer, quant_config=None, sublayer_configs=None, weight_schema=None, input_schema=None, force_weight_schema=None)

Convert MoE weights from checkpoint format to Humming kernel format.

This function processes the weights of each sublayer (w13, w2) in four steps: (1) converting from checkpoint format to Humming format if needed; (2) force-requantizing if a different quantization schema is specified; (3) preparing layer metadata for the Humming kernel; (4) transforming weights for inference.

Parameters:

  • layer

    (RoutedExperts) –

    The RoutedExperts layer containing weights to process

  • quant_config

    (dict | None, default: None ) –

    Optional quantization config dict. Required if weight_schema or input_schema are None. Used to build schemas via BaseWeightSchema.from_config().

  • sublayer_configs

    (dict[str, Any] | None, default: None ) –

    Optional configuration dict for each sublayer (w13, w2). Each config must have "shape_n" and "shape_k" keys. If None, configs are built from layer.moe_config properties.

  • weight_schema

    (Any | None, default: None ) –

    Optional initial weight quantization schema. If None, built from quant_config.

  • input_schema

    (Any | None, default: None ) –

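    Optional initial input quantization schema. If None, it is built from the VLLM_HUMMING_INPUT_QUANT_CONFIG environment setting (reading it from quant_config is still a TODO in the code); quant_config must nevertheless be provided in that case.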

  • force_weight_schema

    (Any | None, default: None ) –

    Optional schema to force requantization to

Side effects
  • Modifies layer parameters in place
  • Sets layer.weight_schemas and layer.input_schemas
Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def convert_to_humming_moe_kernel_format(
    layer: "RoutedExperts",
    quant_config: dict | None = None,
    sublayer_configs: dict[str, Any] | None = None,
    weight_schema: Any | None = None,
    input_schema: Any | None = None,
    force_weight_schema: Any | None = None,
) -> None:
    """
    Convert MoE weights from checkpoint format to Humming kernel format.

    Each configured sublayer (w13 and w2 by default) is processed in turn:
    1. Convert from checkpoint format to humming format if needed
    2. Requantize if a different ``force_weight_schema`` is given
    3. Prepare layer metadata for the Humming kernel
    4. Transform weights for inference

    Args:
        layer: The RoutedExperts layer containing weights to process
        quant_config: Optional quantization config dict. Must be given when
                     either weight_schema or input_schema is None; a missing
                     weight schema is built from it via
                     BaseWeightSchema.from_config().
        sublayer_configs: Optional mapping of sublayer name to a config dict
                         with "shape_n" and "shape_k" keys. If None, configs
                         for "w13" and "w2" are derived from layer.moe_config.
        weight_schema: Optional initial weight quantization schema.
                      If None, built from quant_config.
        input_schema: Optional initial input quantization schema. If None,
                     built from envs.VLLM_HUMMING_INPUT_QUANT_CONFIG, or an
                     empty HummingInputSchema when humming_is_layer_skipped
                     reports the layer as skipped.
        force_weight_schema: Optional schema to force requantization to

    Raises:
        ValueError: If a schema is missing and quant_config is None.

    Side effects:
        - Modifies layer parameters in place
        - Sets layer.weight_schemas and layer.input_schemas
        - Registers an int32 zero buffer "locks" with 1024 entries on the
          device of layer.w13_weight, unless layer already has "locks"
    """
    moe_config = layer.moe_config
    has_bias = moe_config.has_bias
    num_experts = moe_config.num_local_experts
    param_dtype = layer.params_dtype

    # Fill in whichever schema the caller left out.
    schema_missing = weight_schema is None or input_schema is None
    if schema_missing and quant_config is None:
        raise ValueError(
            "Must provide either weight_schema/input_schema or quant_config"
        )

    if schema_missing:
        from humming.layer import HummingInputSchema
        from humming.schema import BaseWeightSchema

        from vllm.model_executor.layers.quantization.utils.humming_utils import (
            humming_is_layer_skipped,
        )

        if weight_schema is None:
            weight_schema = BaseWeightSchema.from_config(quant_config)

        if input_schema is None:
            input_quant_config = envs.VLLM_HUMMING_INPUT_QUANT_CONFIG or {}
            skipped = humming_is_layer_skipped(input_quant_config, layer.layer_name)
            # TODO: read input_quant_config from quant_config
            input_schema = (
                HummingInputSchema()
                if skipped
                else HummingInputSchema.from_config(input_quant_config)
            )

    # Derive the default per-sublayer shapes from the MoE config.
    if sublayer_configs is None:
        w2_k_factor = 1 if moe_config.activation.is_gated else 2
        hidden = moe_config.hidden_dim
        intermediate = moe_config.intermediate_size_per_partition
        sublayer_configs = {
            "w13": {"shape_n": intermediate * 2, "shape_k": hidden},
            "w2": {"shape_n": hidden, "shape_k": intermediate * w2_k_factor},
        }

    layer.weight_schemas = {}
    layer.input_schemas = {}

    for sublayer_name, shape_cfg in sublayer_configs.items():
        sublayer_weight_schema, sublayer_input_schema = _process_single_sublayer(
            layer=layer,
            sublayer_name=sublayer_name,
            shape_n=shape_cfg["shape_n"],
            shape_k=shape_cfg["shape_k"],
            weight_schema=weight_schema,
            input_schema=input_schema,
            has_bias=has_bias,
            num_experts=num_experts,
            param_dtype=param_dtype,
            force_weight_schema=force_weight_schema,
        )
        layer.weight_schemas[sublayer_name] = sublayer_weight_schema
        layer.input_schemas[sublayer_name] = sublayer_input_schema

    # Register the "locks" buffer once per layer.
    if hasattr(layer, "locks"):
        return
    layer.register_buffer(
        "locks",
        torch.zeros(1024, dtype=torch.int32, device=layer.w13_weight.device),
    )

select_humming_moe_experts(config, weight_key, activation_key)

Select the primary Humming MoE Experts class. Note: shape-specific fallbacks may still occur at runtime.

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def select_humming_moe_experts(
    config: FusedMoEConfig,
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
) -> type[mk.FusedMoEExperts] | None:
    """
    Pick the first Humming MoE experts class that supports this setup.

    Candidates are probed in a fixed priority order through their
    ``is_supported_config`` hook; the first one reporting support is logged
    and returned. Returns None when ``has_humming()`` is false or when no
    candidate accepts the configuration.

    Note: Shape-specific fallbacks may still occur at runtime.
    """
    if not has_humming():
        return None

    # NOTE: candidates are tried in exactly this order.
    candidates: tuple[type[mk.FusedMoEExperts], ...] = (
        BatchedHummingGroupedExperts,
        HummingGroupedExperts,
        HummingIndexedExperts,
    )

    # NOTE(rob): We need to peek into the P/F selection to determine
    # if we are using the batched or standard expert format, which
    # is not ideal. Once we unify TP + DP/EP, we can select P/F first.
    if config.moe_parallel_config.use_batched_activation_format:
        activation_format = mk.FusedMoEActivationFormat.BatchedExperts
    else:
        activation_format = mk.FusedMoEActivationFormat.Standard

    for experts_cls in candidates:
        supported, reason = experts_cls.is_supported_config(
            experts_cls,
            config,
            weight_key,
            activation_key,
            activation_format,
        )
        if supported:
            logger.info_once(f"Using {experts_cls.__name__} Humming MoE backend.")
            return experts_cls

        # Record why this candidate was rejected before trying the next one.
        if reason:
            message = (
                f"Humming MoE experts {experts_cls.__name__} does not support the "
                f"deployment configuration since {reason}."
            )
        else:
            message = (
                f"Humming MoE experts '{experts_cls.__name__}' does not support the "
                "deployment configuration."
            )
        logger.debug_once(message)

    return None