Skip to content

llmcompressor.modeling.qwen3_5_moe

Classes:

CalibrationQwen3_5MoeSparseMoeBlock

CalibrationQwen3_5MoeSparseMoeBlock(
    original: Qwen3_5MoeSparseMoeBlock,
    config,
    calibrate_all_experts: bool = True,
)

Bases: MoECalibrationModule

Calibration version of Qwen3_5MoeSparseMoeBlock that unfuses the 3D expert parameters into individual MLP modules built from nn.Linear layers, so that each expert can be quantized individually. With calibrate_all_experts=True (the default), all tokens are sent to all experts during calibration.

is_permanent = True because the unfused structure must persist for quantization to target the individual nn.Linear expert weights.

Source code in src/llmcompressor/modeling/qwen3_5_moe.py
def __init__(
    self,
    original: Qwen3_5MoeSparseMoeBlock,
    config,
    calibrate_all_experts: bool = True,
):
    """Build the calibration block from an existing sparse MoE block.

    :param original: source block; its router weight is copied, its
        ``shared_expert`` / ``shared_expert_gate`` modules are reused as-is,
        and its ``experts`` are wrapped in ``SequentialQwen3_5MoeExperts``
    :param config: model config; the nested ``text_config`` is used when the
        attribute exists, otherwise ``config`` itself
    :param calibrate_all_experts: flag stored on the instance unchanged
    """
    super().__init__()
    self.calibrate_all_experts = calibrate_all_experts

    # Prefer the nested text config when present, else fall back to `config`.
    text_config = getattr(config, "text_config", config)

    # The router is rebuilt as a plain Linear so module_type() returns
    # "Linear"; this ensures gates appear in the ignore list when the
    # config is saved.
    router_weight = original.gate.weight.data
    hidden_size = text_config.hidden_size
    num_experts = text_config.num_experts

    router = torch.nn.Linear(hidden_size, num_experts, bias=False)
    # Match the source weight's dtype/device first, then copy values in place.
    router.weight.data = router.weight.data.to(
        dtype=router_weight.dtype, device=router_weight.device
    )
    router.weight.data.copy_(router_weight)
    self.gate = router

    # Routing parameters needed by the forward pass.
    self.top_k = text_config.num_experts_per_tok
    self.num_experts = num_experts
    self.hidden_dim = self.hidden_size = hidden_size

    # Submodules are registered in the same order as before:
    # gate -> shared_expert -> shared_expert_gate -> experts.
    self.shared_expert = original.shared_expert
    self.shared_expert_gate = original.shared_expert_gate
    self.experts = SequentialQwen3_5MoeExperts(text_config, original.experts)

SequentialQwen3_5MoeExperts

SequentialQwen3_5MoeExperts(config, original)

Bases: ModuleList

Unfuses the 3D expert parameter tensors into individual Qwen3_5MoeMLP modules so that each expert's weights are held in nn.Linear layers and can be targeted by quantization with targets="Linear".

Source code in src/llmcompressor/modeling/qwen3_5_moe.py
def __init__(self, config, original):
    """Create one ``Qwen3_5MoeMLP`` per expert and fill it from fused weights.

    :param config: config providing ``num_experts`` and
        ``moe_intermediate_size``; also passed to each ``Qwen3_5MoeMLP``
    :param original: fused experts object exposing ``gate_up_proj`` and
        ``down_proj`` tensors that are indexed by expert along dim 0
    """
    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
        Qwen3_5MoeMLP,
    )

    self.num_experts = config.num_experts
    inter_size = config.moe_intermediate_size
    expert_count = self.num_experts

    # Build the per-expert MLPs without running weight initialization; every
    # projection weight is overwritten below.
    with skip_weights_initialize():
        mlps = []
        for _ in range(expert_count):
            mlps.append(Qwen3_5MoeMLP(config, intermediate_size=inter_size))
        super().__init__(mlps)

    # NOTE(review): assumes fused layouts
    #   gate_up_proj: [num_experts, 2 * intermediate, hidden]
    #   down_proj:    [num_experts, hidden, intermediate]
    # confirm against the transformers implementation.
    fused_gate_up = original.gate_up_proj.data
    fused_down = original.down_proj.data

    for idx, expert in enumerate(self):
        gate_up_w = fused_gate_up[idx]
        down_w = fused_down[idx]

        # The first `inter_size` rows go to gate_proj and the remaining rows
        # to up_proj; nn.Linear stores weights as [out_features, in_features].
        gate_w = gate_up_w[:inter_size, :]
        up_w = gate_up_w[inter_size:, :]

        # Clone so each expert owns its storage rather than a view of the
        # fused tensor.
        expert.gate_proj.weight.data = gate_w.clone().contiguous()
        expert.up_proj.weight.data = up_w.clone().contiguous()
        expert.down_proj.weight.data = down_w.clone().contiguous()