`vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe` ¶

Functions:

inject_shared_expert_weights –

Merge routed topk results with the shared expert buffer and inject dynamic per-token shared expert gate values for AITER fusion.
rocm_aiter_fused_experts –

ROCm AITER fused MoE expert computation.

`inject_shared_expert_weights(topk_weights, topk_ids, topk, num_fused_shared_experts, shared_expert_weights=None)` ¶

Merge routed topk results with the shared expert buffer and inject dynamic per-token shared expert gate values for AITER fusion.

For routers that already return the combined buffer (e.g. GroupedTopKRouter via rocm_aiter_grouped_topk), only the dynamic weight injection is needed. For routers that return only routed slots (e.g. FusedTopKRouter), this also copies the routed results into the pre-allocated combined buffer.

Source code in vllm/model_executor/layers/fused_moe/experts/rocm_aiter_moe.py

def inject_shared_expert_weights(
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    topk: int,
    num_fused_shared_experts: int,
    shared_expert_weights: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Merge routed topk results with the shared expert buffer and inject
    dynamic per-token shared expert gate values for AITER fusion.

    For routers that already return the combined buffer (e.g. GroupedTopKRouter
    via rocm_aiter_grouped_topk), only the dynamic weight injection is needed.
    For routers that return only routed slots (e.g. FusedTopKRouter), this also
    copies the routed results into the pre-allocated combined buffer.
    """
    if num_fused_shared_experts == 0:
        return topk_weights, topk_ids

    assert aiter_topK_meta_data is not None, (
        "aiter_topK_meta_data is not initialized but "
        "num_fused_shared_experts > 0. Ensure init_aiter_topK_meta_data "
        "is called before routing."
    )

    total_topk_weights, total_topk_ids = aiter_topK_meta_data
    token = topk_weights.shape[0]

    assert total_topk_weights.shape[0] >= token, (
        f"AITER topK meta data supports {total_topk_weights.shape[0]} "
        f"tokens, but got {token} tokens."
    )

    total_topk_weights_slice = total_topk_weights[:token]
    total_topk_ids_slice = total_topk_ids[:token]

    if topk_weights.shape[1] == topk:
        total_topk_weights_slice[:, :topk] = topk_weights
        total_topk_ids_slice[:, :topk] = topk_ids
        topk_weights = total_topk_weights_slice
        topk_ids = total_topk_ids_slice

    if shared_expert_weights is not None:
        topk_weights[:, topk : topk + num_fused_shared_experts] = shared_expert_weights[
            :token
        ]

    return topk_weights, topk_ids

`rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, moe_config, activation=MoEActivation.SILU, apply_router_weight_on_input=False, expert_map=None, quant_config=None, a1q_scale=None, num_local_tokens=None, output_dtype=None, moe_sorting_dispatch_policy=0)` ¶

ROCm AITER fused MoE expert computation.

Source code in vllm/model_executor/layers/fused_moe/experts/rocm_aiter_moe.py

def rocm_aiter_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    moe_config: FusedMoEConfig,
    activation: MoEActivation = MoEActivation.SILU,
    apply_router_weight_on_input: bool = False,
    expert_map: torch.Tensor | None = None,
    quant_config: FusedMoEQuantConfig | None = None,
    a1q_scale: torch.Tensor | None = None,
    num_local_tokens: torch.Tensor | None = None,
    output_dtype: torch.dtype | None = None,
    moe_sorting_dispatch_policy: int = 0,
) -> torch.Tensor:
    """Run the MoE expert computation through the ROCm AITER kernels.

    The call is dispatched to one of two AITER entry points:

    * ``rocm_aiter_ops.asm_moe_tkw1`` when the quant config requests
      per-activation-token FP8 w8a8 quantization *and*
      ``apply_router_weight_on_input`` is set.
    * ``rocm_aiter_ops.fused_moe`` in every other case, after selecting the
      quantization method, the hidden/intermediate padding and the gate mode.

    Args:
        hidden_states: Input activations; dimension 1 is compared against
            ``moe_config.hidden_dim_unpadded`` to derive the hidden padding.
        w1: First expert weight tensor, forwarded to the kernel.
        w2: Second expert weight tensor, forwarded to the kernel.
        topk_weights: Router weights; cast to ``float32`` before the call.
        topk_ids: Selected expert ids; cast to ``int32`` before the call.
        moe_config: Supplies the unpadded sizes, ``intermediate_pad`` and
            ``tp_size`` used for the padding computation.
        activation: Activation selector; SILU, GELU, SWIGLUOAI and
            SWIGLUOAI_UNINTERLEAVE are accepted.
        apply_router_weight_on_input: Forwarded as ``doweight_stage1``; only
            valid with a single selected expert per token.
        expert_map: Optional tensor forwarded as the kernel's ``expert_mask``.
        quant_config: Quantization settings; defaults to
            ``FUSED_MOE_UNQUANTIZED_CONFIG`` when ``None``.
        a1q_scale: Optional override for ``quant_config.a1_scale``.
        num_local_tokens: Forwarded to ``fused_moe``; must be ``None`` on the
            tkw1 path.
        output_dtype: Forwarded to ``fused_moe``.
        moe_sorting_dispatch_policy: Forwarded to ``fused_moe``.

    Returns:
        The tensor produced by the selected AITER kernel.

    Raises:
        ValueError: If ``activation`` is not one of the supported values.
        AssertionError: If a shape, top-k or configuration precondition of
            the chosen path does not hold.
    """
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG

    # Translate the vLLM activation into the AITER one. The gate/up
    # interleave hint stays unset unless the un-interleaved SWIGLUOAI variant
    # explicitly pins it.
    activation_interleave = None
    if activation == MoEActivation.SILU:
        activation_method = ActivationMethod.SILU
    elif activation == MoEActivation.GELU:
        activation_method = ActivationMethod.GELU
    elif activation in (
        MoEActivation.SWIGLUOAI,
        MoEActivation.SWIGLUOAI_UNINTERLEAVE,
    ):
        activation_method = rocm_aiter_ops.get_aiter_activation_type("swiglu")
        if activation == MoEActivation.SWIGLUOAI_UNINTERLEAVE:
            activation_interleave = False
    else:
        raise ValueError(f"Unsupported activation: {activation}")

    # Every AITER fused MoE kernel expects fp32 weights and int32 expert ids.
    topk_weights = topk_weights.to(torch.float32)
    topk_ids = topk_ids.to(torch.int32)

    expert_mask = expert_map

    # FP8 w8a8 with per-token activation quantization and router weights
    # applied on the input goes to the dedicated tkw1 kernel, which applies
    # topk_weights on the GEMM output of the first FC layer rather than the
    # second.
    use_tkw1_kernel = (
        quant_config.per_act_token_quant
        and apply_router_weight_on_input
        and quant_config.use_fp8_w8a8
    )
    if use_tkw1_kernel:
        assert topk_weights.dim() == 2, (
            "`topk_weights` should be in shape (num_tokens, topk)"
        )
        assert topk_weights.shape[-1] == 1, (
            "Only support topk=1 when `apply_router_weight_on_input` is True"
        )
        assert num_local_tokens is None, (
            "AITER tkw1 kernel does not support `num_local_tokens`"
        )

        return rocm_aiter_ops.asm_moe_tkw1(
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            fc1_scale=quant_config.w1_scale,
            fc2_scale=quant_config.w2_scale,
            fc1_smooth_scale=None,
            fc2_smooth_scale=None,
            a16=False,
            per_tensor_quant_scale=None,
            expert_mask=expert_mask,
            activation_method=activation_method,
        )

    # ---- Generic fused_moe path -------------------------------------------

    # Pick the quantization method. Any FP8 w8a8 setting takes precedence
    # over the mxfp4 one; mxfp6 and mxfp8 are unsupported in AITER currently
    # and use emulation instead.
    if quant_config.use_fp8_w8a8:
        if quant_config.block_shape is not None:
            # Block-scaled w8a8.
            assert not apply_router_weight_on_input, (
                "apply_router_weight_on_input is not supported for block scaled moe"
            )
            assert quant_config.w1_scale is not None
            assert quant_config.w2_scale is not None
            quant_method = QuantMethod.BLOCK_128x128.value
        elif quant_config.per_out_ch_quant:
            quant_method = QuantMethod.PER_TOKEN.value
        else:
            # Remaining FP8 case: per-tensor quantization.
            quant_method = QuantMethod.PER_TENSOR.value
    elif quant_config.use_mxfp4_w4a4 or quant_config.use_mxfp4_w4a16:
        # mxfp4 (w4a4 and w4a16) uses BLOCK_1X32.
        quant_method = QuantMethod.BLOCK_1X32.value
    else:
        quant_method = QuantMethod.NO.value

    if apply_router_weight_on_input:
        assert topk_weights.dim() == 2, (
            "`topk_weights` should be in shape (num_tokens, topk)"
        )
        _, num_selected = topk_weights.shape
        assert num_selected == 1, (
            "Only support topk=1 when `apply_router_weight_on_input` is True"
        )

    # Padding for the CK MXFP4 kernels is derived on the fly from the
    # difference between the padded and unpadded sizes.
    assert moe_config.hidden_dim_unpadded is not None
    assert moe_config.intermediate_size_per_partition_unpadded is not None
    raw_hidden_pad = hidden_states.shape[1] - moe_config.hidden_dim_unpadded
    if moe_config.intermediate_pad is None:
        raw_intermediate_pad = (
            moe_config.intermediate_size_per_partition
            - moe_config.intermediate_size_per_partition_unpadded
        )
    else:
        # An explicit value in the config overrides the computed difference.
        raw_intermediate_pad = moe_config.intermediate_pad

    # Round both pads down to match AITER's CK/FlyDSL MoE dispatch
    # (currently pinned to v0.1.13.post1):
    # https://github.com/ROCm/aiter/blob/v0.1.13.post1/aiter/fused_moe.py#L1073
    # https://github.com/ROCm/aiter/blob/v0.1.13.post1/aiter/fused_moe.py#L1099
    # TODO: Revisit this once we bump AITER to 0.1.15 with padding fixes
    # for CK/FlyDSL MoE GEMM e.g. https://github.com/ROCm/aiter/pull/3401
    hidden_pad = raw_hidden_pad // 128 * 128
    tp_multiplier = 2 if moe_config.tp_size == 1 else 1
    intermediate_pad = raw_intermediate_pad // 64 * 64 * tp_multiplier

    # https://github.com/ROCm/aiter/pull/3123 specialized the AITER stage1
    # GEMMs for interleaved vs separated gate and up weights.
    # For gpt-oss, i.e. use_mxfp4_w4a16=True, the weights are shuffled by
    # `rocm_aiter_ops.shuffle_weight_a16w4` in `oracle/mxfp4.py`, which always
    # sets `is_guinterleave=True`, so GateMode.INTERLEAVE is passed to match.
    from aiter.ops.flydsl.moe_common import GateMode

    uses_mxfp4_w4a16 = quant_config.use_mxfp4_w4a16
    if uses_mxfp4_w4a16:
        gate_mode = GateMode.INTERLEAVE.value
    elif activation_interleave is None:
        # No hint from either the weights or the activation.
        gate_mode = ""
    elif activation_interleave:
        gate_mode = GateMode.INTERLEAVE.value
    else:
        gate_mode = GateMode.SEPARATED.value

    # A caller-provided activation scale wins over the configured one.
    a1_scale = a1q_scale if a1q_scale is not None else quant_config.a1_scale
    # Biases are only forwarded for the mxfp4 w4a16 configuration.
    bias1 = quant_config.w1_bias if uses_mxfp4_w4a16 else None
    bias2 = quant_config.w2_bias if uses_mxfp4_w4a16 else None

    return rocm_aiter_ops.fused_moe(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        expert_mask=expert_mask,
        quant_method=quant_method,
        activation_method=activation_method,
        w1_scale=quant_config.w1_scale,
        w2_scale=quant_config.w2_scale,
        a1_scale=a1_scale,
        a2_scale=quant_config.a2_scale,
        doweight_stage1=apply_router_weight_on_input,
        num_local_tokens=num_local_tokens,
        output_dtype=output_dtype,
        hidden_pad=hidden_pad,
        intermediate_pad=intermediate_pad,
        gate_mode=gate_mode,
        bias1=bias1,
        bias2=bias2,
        moe_sorting_dispatch_policy=moe_sorting_dispatch_policy,
    )

vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe ¶

inject_shared_expert_weights(topk_weights, topk_ids, topk, num_fused_shared_experts, shared_expert_weights=None) ¶

rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, moe_config, activation=MoEActivation.SILU, apply_router_weight_on_input=False, expert_map=None, quant_config=None, a1q_scale=None, num_local_tokens=None, output_dtype=None, moe_sorting_dispatch_policy=0) ¶

`vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe` ¶

`inject_shared_expert_weights(topk_weights, topk_ids, topk, num_fused_shared_experts, shared_expert_weights=None)` ¶

`rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, moe_config, activation=MoEActivation.SILU, apply_router_weight_on_input=False, expert_map=None, quant_config=None, a1q_scale=None, num_local_tokens=None, output_dtype=None, moe_sorting_dispatch_policy=0)` ¶