Skip to content

vllm.model_executor.layers.quantization.humming

Classes:

HummingMoEMethod

Bases: FusedMoEMethodBase

Methods:

  • apply

    Apply Humming-quantized MoE computation using the standard kernel flow.

Source code in vllm/model_executor/layers/quantization/humming.py
class HummingMoEMethod(FusedMoEMethodBase):
    """Fused-MoE quantization method backed by Humming kernels.

    The method allocates the per-expert ``w13_*`` / ``w2_*`` parameters
    described by the Humming weight schema, wraps the weight loader so that
    floating-point checkpoints can be quantized while loading, converts the
    loaded tensors to the Humming kernel format, and forwards ``apply`` to
    the kernel built in ``process_weights_after_loading``.
    """

    def __init__(
        self, quant_config: HummingLayerQuantizationConfig, moe: "FusedMoEConfig"
    ) -> None:
        """Store the layer schemas and select the Humming experts class.

        Args:
            quant_config: Per-layer Humming quantization config; its weight,
                input and ``force_*`` schemas are copied onto the instance.
            moe: MoE configuration forwarded to the base class.
        """
        super().__init__(moe)
        self.quant_config = quant_config
        self.weight_schema = quant_config.weight_schema
        self.input_schema = quant_config.input_schema
        self.force_weight_schema = quant_config.force_weight_schema
        self.force_input_schema = quant_config.force_input_schema

        # Derive QuantKeys from humming schemas.
        # Prefer force schemas (the final format after requant) over base.
        weight_key = weight_schema_to_quant_key(
            self.force_weight_schema or self.weight_schema
        )
        activation_key = input_schema_to_quant_key(
            self.force_input_schema or self.input_schema
        )

        # Select Humming MoE experts
        self.experts_cls = select_humming_moe_experts(
            config=self.moe,
            weight_key=weight_key,
            activation_key=activation_key,
        )

    def prepare_weight_loader(self, layer, weight_loader):
        """Wrap ``weight_loader`` with Humming-specific preprocessing.

        Args:
            layer: Layer owning the ``w13_*`` / ``w2_*`` parameters. Its
                ``param_dtype`` attribute is read on the online-quant path.
            weight_loader: Underlying loader that receives the tensor after
                schema-specific processing.

        Returns:
            A loader that quantizes float16/bfloat16/float32 tensors destined
            for a parameter named ``"weight"`` and dispatches the resulting
            tensors to the matching sub-parameters; any other tensor is
            passed through ``weight_schema.process_loaded_weight`` and then
            handed to ``weight_loader``.
        """

        def new_weight_loader(
            param: torch.nn.Parameter,
            loaded_weight: torch.Tensor,
            weight_name: str,
            shard_id: str,
            expert_id: int | None = None,
            return_success: bool = False,
        ):
            name = param.param_name
            float_dtypes = (torch.float16, torch.bfloat16, torch.float32)
            is_unquantized = name == "weight" and loaded_weight.dtype in float_dtypes
            # online quant (fp16/bf16 -> quant_type)
            if is_unquantized:
                assert isinstance(self.weight_schema, _hm.HummingWeightSchema)
                f16_dtype = _hm.DataType.from_torch_dtype(layer.param_dtype)
                has_global_scale = "TENSOR" in str(self.weight_schema.weight_scale_type)
                tensor_list = _hm.quantize_weight(
                    weight=loaded_weight,
                    dtype=self.weight_schema.b_dtype,
                    scale_dtype=self.weight_schema.bs_dtype or f16_dtype,
                    group_size=self.weight_schema.weight_scale_group_size,
                    has_zero_point=self.weight_schema.has_zero_point,
                    has_global_scale=has_global_scale,
                    is_fp_zero_point=self.weight_schema.is_fp_zero_point,
                    pack=True,
                )

                key_list = ["weight", "weight_scale", "zero_point", "global_scale"]
                # The destination sublayer depends only on the shard, so it is
                # resolved once instead of on every loop iteration.
                sublayer_name = "w2" if shard_id == "w2" else "w13"
                success = True
                for key, tensor in zip(key_list, tensor_list):
                    # Outputs the quantizer did not produce are skipped.
                    if tensor is None or tensor.nelement() == 0:
                        continue

                    # A distinct name keeps the incoming ``param`` argument
                    # from being rebound inside the loop.
                    target_param = getattr(layer, sublayer_name + "_" + key)
                    part_success = target_param.weight_loader(
                        param=target_param,
                        loaded_weight=tensor.cpu(),
                        weight_name=shard_id + "_" + key,
                        shard_id=shard_id,
                        expert_id=expert_id,
                        return_success=return_success,
                    )
                    success = success and part_success

                return success if return_success else None

            # weight processing logic for specific quantization schema
            loaded_weight = self.weight_schema.process_loaded_weight(
                tensor=loaded_weight,
                name=name,
            )
            return weight_loader(
                param,
                loaded_weight,
                weight_name,
                shard_id=shard_id,
                expert_id=expert_id,
                return_success=return_success,
            )

        return new_weight_loader

    def create_weights(
        self,
        layer: RoutedExperts,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Allocate the Humming parameters and buffers on ``layer``.

        Args:
            layer: Layer that receives the parameters, the
                ``sublayer_configs`` mapping and the ``locks`` buffer.
            num_experts: Number of experts, stored on the layer and passed
                to the weight schema.
            hidden_size: Used as ``shape_k`` of ``w13`` and ``shape_n`` of
                ``w2``.
            intermediate_size_per_partition: Used as ``shape_k`` of ``w2``;
                twice this value is ``shape_n`` of ``w13``.
            params_dtype: Stored as ``layer.param_dtype`` and passed to the
                weight schema as ``param_dtype``.
            **extra_weight_attrs: Extra attributes applied to every created
                parameter. The ``weight_loader`` entry (or
                ``default_weight_loader`` when absent) is replaced by the
                wrapper from ``prepare_weight_loader``.
        """
        layer.num_experts = num_experts
        layer.param_dtype = params_dtype
        layer.intermediate_size = intermediate_size_per_partition
        weight_loader = extra_weight_attrs.get("weight_loader", default_weight_loader)
        weight_loader = self.prepare_weight_loader(layer, weight_loader)
        extra_weight_attrs["weight_loader"] = weight_loader

        # sublayer: a layer contains multiple sets of weights for quantized GEMM
        # (e.g., weight, weight_scale, etc.).
        # The weight names of sublayer start with the prefix "{sublayer_name}_"
        sublayer_shapes = {
            "w13": (intermediate_size_per_partition * 2, hidden_size),
            "w2": (hidden_size, intermediate_size_per_partition),
        }
        layer.sublayer_configs = {
            sublayer_name: {
                "shape_n": shape_n,
                "shape_k": shape_k,
                "tensors_attrs": self.weight_schema.get_padded_tensors_attrs(
                    shape_n=shape_n,
                    shape_k=shape_k,
                    num_experts=num_experts,
                    param_dtype=params_dtype,
                    has_bias=self.moe.has_bias,
                ),
            }
            for sublayer_name, (shape_n, shape_k) in sublayer_shapes.items()
        }

        for sublayer_name, configs in layer.sublayer_configs.items():
            for name, attrs in configs["tensors_attrs"].items():
                tensor = torch.empty(attrs["shape"], dtype=attrs["dtype"])
                # Schema-provided attributes come first so that the caller's
                # extra attributes win on key collisions.
                extra_attrs = attrs.get("extra_attrs", {}).copy()
                extra_attrs.update(extra_weight_attrs)
                param = prepare_moe_param(tensor, name, extra_attrs)
                setattr(layer, f"{sublayer_name}_{name}", param)

        if self.force_input_schema is not None:
            self.input_schema = self.force_input_schema

        # NOTE(review): presumably scratch space used by the kernels for
        # synchronization; the size 1024 is not explained here - confirm.
        locks = torch.zeros(1024, dtype=torch.int32)
        layer.register_buffer("locks", locks)

    def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig:
        """Return the fused-MoE quant config built from ``layer``."""
        return get_humming_moe_quant_config(layer)

    def process_weights_after_loading(self, layer: RoutedExperts) -> None:
        """Convert loaded weights to kernel format and build the MoE kernel.

        The work is done at most once per method instance: a ``processed``
        flag set on ``self`` turns later calls into no-ops.
        """
        if getattr(self, "processed", False):
            return
        self.processed = True

        # Convert weights to Humming kernel format
        convert_to_humming_moe_kernel_format(
            layer=layer,
            sublayer_configs=layer.sublayer_configs,
            weight_schema=self.weight_schema,
            input_schema=self.input_schema,
            force_weight_schema=self.force_weight_schema,
        )

        # Build the MoE kernel
        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
        assert self.moe_quant_config is not None
        assert self.experts_cls is not None
        self.moe_kernel = make_humming_moe_kernel(
            self.moe_quant_config,
            self.moe,
            self.experts_cls,
            layer=layer,
            routing_tables=layer._expert_routing_tables(),
        )

    def apply(
        self,
        layer: RoutedExperts,
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        """
        Apply Humming-quantized MoE computation using the standard kernel flow.

        This method uses FusedMoEKernel.apply() which orchestrates:
        1. Preparation (quantization if needed - skipped for Humming via
           expects_unquantized_inputs=True to prevent double quantization)
        2. Expert computation (via experts.apply())
        3. Finalization (weight application & reduction - no-op for Humming
           since it's already done internally)

        Humming handles all quantization, weight application, and reduction
        internally in the experts.apply() method via HummingMethod calls.

        Note: Although w1/w2 weights are passed to the kernel for interface
        consistency, Humming's experts.apply() reads weights directly from
        the layer object via HummingMethod.forward_layer() and ignores the
        w1/w2 parameters.

        Args:
            layer: Layer providing ``w13_weight``, ``w2_weight``,
                ``activation``, ``global_num_experts`` and ``expert_map``.
            x: Hidden states forwarded to the kernel.
            topk_weights: Routing weights forwarded to the kernel.
            topk_ids: Selected expert ids forwarded to the kernel.
            shared_experts: Optional shared experts forwarded to the kernel.
            shared_experts_input: Optional shared-experts input forwarded to
                the kernel.

        Returns:
            Whatever ``self.moe_kernel.apply`` returns.
        """
        assert self.moe_kernel is not None
        return self.moe_kernel.apply(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_ids=topk_ids,
            topk_weights=topk_weights,
            activation=layer.activation,
            global_num_experts=layer.global_num_experts,
            expert_map=layer.expert_map,
            apply_router_weight_on_input=False,
            shared_experts=shared_experts,
            shared_experts_input=shared_experts_input,
        )

apply(layer, x, topk_weights, topk_ids, shared_experts, shared_experts_input)

Apply Humming-quantized MoE computation using the standard kernel flow.

This method uses FusedMoEKernel.apply() which orchestrates:

1. Preparation (quantization if needed - skipped for Humming via expects_unquantized_inputs=True to prevent double quantization)
2. Expert computation (via experts.apply())
3. Finalization (weight application & reduction - no-op for Humming since it's already done internally)

Humming handles all quantization, weight application, and reduction internally in the experts.apply() method via HummingMethod calls.

Note: Although w1/w2 weights are passed to the kernel for interface consistency, Humming's experts.apply() reads weights directly from the layer object via HummingMethod.forward_layer() and ignores the w1/w2 parameters.

Source code in vllm/model_executor/layers/quantization/humming.py
def apply(
    self,
    layer: RoutedExperts,
    x: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    shared_experts: SharedExperts | None,
    shared_experts_input: torch.Tensor | None,
) -> torch.Tensor:
    """
    Run the Humming-quantized MoE forward pass through the prebuilt kernel.

    The call is delegated to ``self.moe_kernel.apply``, i.e. the standard
    FusedMoEKernel flow:
    1. Preparation (input quantization is skipped for Humming via
       expects_unquantized_inputs=True, which prevents double quantization)
    2. Expert computation (via experts.apply())
    3. Finalization (weight application & reduction - a no-op for Humming,
       which has already done both internally)

    All quantization, weight application and reduction happen inside
    experts.apply() through HummingMethod calls.

    Note: w1/w2 are supplied only to satisfy the kernel interface. Humming's
    experts.apply() takes its weights straight from the layer object via
    HummingMethod.forward_layer() and ignores the w1/w2 arguments.
    """
    kernel = self.moe_kernel
    assert kernel is not None
    run_kernel = kernel.apply

    # Gather the keyword arguments first, then dispatch in one call.
    kernel_kwargs = dict(
        hidden_states=x,
        w1=layer.w13_weight,
        w2=layer.w2_weight,
        topk_ids=topk_ids,
        topk_weights=topk_weights,
        activation=layer.activation,
        global_num_experts=layer.global_num_experts,
        expert_map=layer.expert_map,
        apply_router_weight_on_input=False,
        shared_experts=shared_experts,
        shared_experts_input=shared_experts_input,
    )
    return run_kernel(**kernel_kwargs)