`vllm.model_executor.layers.fused_moe.oracle.int_wna16` ¶

Functions:

backend_to_kernel_cls –

Return the experts class for the given backend, or None for NONE.
convert_to_wna16_moe_kernel_format –

Dispatch weight post-processing to the appropriate per-backend handler.
make_wna16_moe_quant_config –

Create the FusedMoEQuantConfig for 4 or 8-bit WNA16 MoE.
map_wna16_backend –

Map user's MoEBackend to WNA16MoEBackend.
select_wna16_moe_backend –

Select the WNA16 MoE backend.

`_MoeWNA16HummingWeightSchema` ¶

Adapter from MoeWNA16's generic packed layout to Humming's layout.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

class _MoeWNA16HummingWeightSchema:
    """Adapter from MoeWNA16's generic packed layout to Humming's layout."""

    def __init__(self, bits: int, group_size: int, has_zero_point: bool) -> None:
        self.bits = bits
        self.group_size = group_size
        self.has_zero_point = has_zero_point

    def convert_humming(
        self,
        tensors: dict[str, torch.Tensor],
        shape_n_stacks: list[int],
        shape_k_stacks: list[int],
        param_dtype: torch.dtype,
        num_experts: int | None = None,
    ) -> tuple[Any, dict[str, torch.Tensor]]:
        del shape_n_stacks, shape_k_stacks, num_experts
        from vllm.utils.humming import HummingWeightSchema, dtypes

        output = _convert_moe_wna16_humming_tensors(
            tensors, has_zero_point=self.has_zero_point
        )
        output["weight_scale"] = output["weight_scale"].to(param_dtype)
        schema = HummingWeightSchema(
            b_dtype=dtypes.DataType.from_str(f"uint{self.bits}"),
            weight_scale_group_size=self.group_size,
            has_zero_point=self.has_zero_point,
        )
        return schema, output

`_convert_moe_wna16_humming_tensors(tensors, has_zero_point)` ¶

Convert MoeWNA16's N-first uint8 packing to Humming's int32 packing.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _convert_moe_wna16_humming_tensors(
    tensors: dict[str, torch.Tensor], has_zero_point: bool
) -> dict[str, torch.Tensor]:
    """Convert MoeWNA16's N-first uint8 packing to Humming's int32 packing."""
    if sys.byteorder != "little":
        raise NotImplementedError(
            "MoeWNA16 to Humming conversion requires a little-endian host."
        )

    output = {
        "weight": tensors["qweight"].contiguous().view(torch.int32),
        "weight_scale": tensors["scales"],
    }
    if has_zero_point:
        qzeros = tensors["qzeros"]
        output["zero_point"] = (
            qzeros.transpose(-1, -2)
            .contiguous()
            .view(torch.int32)
            .transpose(-1, -2)
            .contiguous()
        )
    return output

`_get_priority_backends()` ¶

Get available backends in priority order based on platform and config.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _get_priority_backends() -> list[WNA16MoEBackend]:
    """Return the candidate WNA16 MoE backends, highest priority first.

    CPU and XPU platforms each have exactly one candidate; every other
    platform gets the full ordered list below.
    """
    if current_platform.is_cpu():
        candidates = [WNA16MoEBackend.CPU]
    elif current_platform.is_xpu():
        candidates = [WNA16MoEBackend.XPU]
    else:
        # List order is the selection priority.
        candidates = [
            WNA16MoEBackend.FLASHINFER_TRTLLM,
            WNA16MoEBackend.MARLIN,
            WNA16MoEBackend.BATCHED_MARLIN,
            WNA16MoEBackend.TRITON,
            WNA16MoEBackend.HUMMING,
            WNA16MoEBackend.EMULATION,
        ]
    return candidates

`_humming_wna16_weight_schema(quant_config)` ¶

Humming weight schema for a WNA16 checkpoint, derived from the quant config rather than the running kernel.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _humming_wna16_weight_schema(
    quant_config: QuantizationConfig | QuantizationArgs | None,
) -> dict[str, Any]:
    """Describe a WNA16 checkpoint as a Humming weight-schema dict.

    The description is derived from the checkpoint's quantization config,
    not from whichever kernel ends up running.

    Raises:
        TypeError: if ``quant_config`` is neither an ``AutoAWQConfig`` nor
            an ``AutoGPTQConfig``.
    """
    from vllm.model_executor.layers.quantization.auto_awq import AutoAWQConfig
    from vllm.model_executor.layers.quantization.auto_gptq import AutoGPTQConfig

    # Reject unsupported configs up front.
    if not isinstance(quant_config, (AutoAWQConfig, AutoGPTQConfig)):
        raise TypeError(
            "Humming WNA16 checkpoint schema requires AutoAWQConfig or "
            f"AutoGPTQConfig, got {type(quant_config).__name__}."
        )

    # Both supported config types expose these two fields.
    bits = quant_config.weight_bits
    group_size = quant_config.group_size

    if isinstance(quant_config, AutoAWQConfig):
        return {
            "quant_method": "awq",
            "bits": bits,
            "group_size": group_size,
            "zero_point": quant_config.zero_point,
        }
    return {
        "quant_method": "gptq",
        "bits": bits,
        "group_size": group_size,
        "desc_act": quant_config.desc_act,
        "sym": quant_config.is_sym,
    }

`_pad_rows(x, padded_rows)` ¶

Zero-pad a (E, rows, cols) tensor to padded_rows rows.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _pad_rows(x: torch.Tensor, padded_rows: int) -> torch.Tensor:
    """Zero-pad a ``(E, rows, cols)`` tensor to ``padded_rows`` rows."""
    if padded_rows == x.size(1):
        return x
    return torch.nn.functional.pad(x, (0, 0, 0, padded_rows - x.size(1)))

`_pad_w13_bias(bias, n, padded_n)` ¶

Zero-pad each gate/up shard of a (E, 2 * n) bias to padded_n.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _pad_w13_bias(bias: torch.Tensor, n: int, padded_n: int) -> torch.Tensor:
    """Zero-pad each gate/up shard of a ``(E, 2 * n)`` bias to ``padded_n``."""
    if padded_n == n:
        return bias
    e = bias.size(0)
    bias = bias.view(e, 2, n)
    bias = torch.nn.functional.pad(bias, (0, padded_n - n))
    return bias.reshape(e, 2 * padded_n).contiguous()

`_pad_w13_shard_cols(x, unit, padded_unit)` ¶

Zero-pad each of the two gate/up shards of a (E, rows, 2 * unit) tensor along its last dim, from unit to padded_unit columns.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _pad_w13_shard_cols(x: torch.Tensor, unit: int, padded_unit: int) -> torch.Tensor:
    """Zero-pad each of the two gate/up shards of a ``(E, rows, 2 * unit)``
    tensor along its last dim, from ``unit`` to ``padded_unit`` columns."""
    if padded_unit == unit:
        return x
    e, rows, _ = x.shape
    x = x.view(e, rows, 2, unit)
    x = torch.nn.functional.pad(x, (0, padded_unit - unit))
    return x.reshape(e, rows, 2 * padded_unit).contiguous()

`_process_awq_weights_marlin(layer, weight_bits, pack_factor, group_size, input_dtype, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_qzeros, w2_qzeros, w13_bias=None, w2_bias=None)` ¶

AWQ-specific Marlin weight post-processing.

AWQ checkpoints use a different packing order than GPTQ, so they need AWQ-specific weight repacking and zero-point conversion before Marlin runs.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_awq_weights_marlin(
    layer: torch.nn.Module,
    weight_bits: int,
    pack_factor: int,
    group_size: int,
    input_dtype: torch.dtype | None,
    w13_qweight: torch.Tensor,
    w2_qweight: torch.Tensor,
    w13_scales: torch.Tensor,
    w2_scales: torch.Tensor,
    w13_qzeros: torch.Tensor,
    w2_qzeros: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor | None,  # w13_g_idx
    torch.Tensor | None,  # w2_g_idx
    torch.Tensor | None,  # w13_g_idx_sort_indices
    torch.Tensor | None,  # w2_g_idx_sort_indices
    torch.Tensor | None,  # w13_qzeros
    torch.Tensor | None,  # w2_qzeros
    torch.Tensor | None,  # w13_input_global_scale
    torch.Tensor | None,  # w2_input_global_scale
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """AWQ-specific Marlin weight post-processing.

    AWQ checkpoints use a different packing order than GPTQ, so they need
    AWQ-specific weight repacking and zero-point conversion before Marlin runs.

    Steps
    -----
    1. For ``torch.float8_e4m3fn`` activations, preprocess the packed weights
       and zero points with ``ops.marlin_int4_fp8_preprocess`` and scale the
       scales by 512.
    2. Pad the intermediate size to the value returned by
       ``marlin_moe_padded_intermediate`` when it differs from the layer's.
    3. Repack weights via ``ops.awq_marlin_moe_repack``.
    4. Permute scales (and, for int8 activations with more than one group,
       extract the input global scales).
    5. Convert zero points via ``moe_awq_to_marlin_zero_points``.
    6. Permute bias tensors.

    Parameters
    ----------
    layer:
        Module providing ``intermediate_size_per_partition``,
        ``num_groups_w13`` and ``num_groups_w2``.
    weight_bits:
        Forwarded as ``num_bits`` to the repack and zero-point helpers.
    pack_factor:
        Number of quantized values per packed element; used to convert
        between packed and unpacked column counts.
    group_size:
        Quantization group size; row padding of the w2 scales and zero
        points is only applied when it is positive.
    input_dtype:
        Activation dtype, or ``None``. A one-byte dtype enables the
        ``is_a_8bit`` mode of the helpers.

    Returns
    -------
    A 14-tuple in the order given by the return annotation. Both ``g_idx``
    slots are ``None`` and both sort-index slots are empty
    ``(num_experts, 0)`` int32 parameters.

    Notes
    -----
    In the FP8 path the preprocessing is called with ``inplace=True`` on
    views of the qweight/qzeros arguments, so the caller's tensors are
    presumably modified in place.
    """
    num_experts = w13_qweight.shape[0]
    device = w13_qweight.device
    # Any activation dtype that is one byte wide (e.g. int8 or FP8) counts as
    # 8-bit activations.
    is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1
    w13_input_global_scale: torch.Tensor | None = None
    w2_input_global_scale: torch.Tensor | None = None
    w13_bias_out: torch.Tensor | None = None
    w2_bias_out: torch.Tensor | None = None

    if input_dtype == torch.float8_e4m3fn:
        # The op receives 2-D views with the expert and row dims flattened
        # together.
        ops.marlin_int4_fp8_preprocess(
            w13_qweight.view(-1, w13_qweight.size(2)),
            w13_qzeros.view(-1, w13_qzeros.size(2)),
            inplace=True,
        )
        ops.marlin_int4_fp8_preprocess(
            w2_qweight.view(-1, w2_qweight.size(2)),
            w2_qzeros.view(-1, w2_qzeros.size(2)),
            inplace=True,
        )
        # NOTE(review): the 512 factor presumably pairs with the FP8
        # preprocessing above — confirm against the kernel.
        w13_scales = w13_scales.data * 512
        w2_scales = w2_scales.data * 512

    # --- Pad the intermediate size to a valid Marlin thread tile ---
    # AWQ packs along N: w13's N is in the (shard) columns, w2's N in the rows.
    N = layer.intermediate_size_per_partition
    padded_N = marlin_moe_padded_intermediate(N, group_size)
    if padded_N != N:
        # qweight and qzeros columns are packed, so their per-shard widths are
        # expressed in packed units (N // pack_factor); scales are unpacked.
        w13_qweight = _pad_w13_shard_cols(
            w13_qweight, N // pack_factor, padded_N // pack_factor
        )
        w2_qweight = _pad_rows(w2_qweight, padded_N)
        w13_scales = _pad_w13_shard_cols(w13_scales, N, padded_N)
        w13_qzeros = _pad_w13_shard_cols(
            w13_qzeros, N // pack_factor, padded_N // pack_factor
        )
        if group_size > 0:
            # One scale / zero-point row per group along w2's rows.
            w2_scales = _pad_rows(w2_scales, padded_N // group_size)
            w2_qzeros = _pad_rows(w2_qzeros, padded_N // group_size)
        # Only the w13 bias is padded; w2_bias is left at its original size.
        if w13_bias is not None:
            w13_bias = _pad_w13_bias(w13_bias, N, padded_N)

    # Empty (num_experts, 0) sort indices are passed to the repack op and
    # returned to the caller.
    w13_g_idx_sort_indices = torch.nn.Parameter(
        torch.empty((num_experts, 0), dtype=torch.int32, device=device),
        requires_grad=False,
    )
    w2_g_idx_sort_indices = torch.nn.Parameter(
        torch.empty((num_experts, 0), dtype=torch.int32, device=device),
        requires_grad=False,
    )

    # --- Repack weights ---
    # size_n is the unpacked column count: packed columns * pack_factor.
    marlin_w13_qweight = ops.awq_marlin_moe_repack(
        w13_qweight,
        w13_g_idx_sort_indices,
        size_k=w13_qweight.shape[1],
        size_n=w13_qweight.shape[2] * pack_factor,
        num_bits=weight_bits,
        is_a_8bit=is_a_8bit,
    )
    marlin_w2_qweight = ops.awq_marlin_moe_repack(
        w2_qweight,
        w2_g_idx_sort_indices,
        size_k=w2_qweight.shape[1],
        size_n=w2_qweight.shape[2] * pack_factor,
        num_bits=weight_bits,
        is_a_8bit=is_a_8bit,
    )

    # --- Permute scales ---
    # NOTE(review): size_k uses the layer's unpadded intermediate size even
    # when padding was applied above — confirm this is what the helper expects.
    marlin_w13_scales = marlin_moe_permute_scales(
        s=w13_scales,
        size_k=layer.intermediate_size_per_partition,
        size_n=w13_scales.shape[2],
        group_size=group_size,
        is_a_8bit=is_a_8bit,
    )
    # For int8 activations with more than one group, the helper also returns
    # an input global scale alongside the processed scales.
    if input_dtype == torch.int8 and layer.num_groups_w13 > 1:
        marlin_w13_scales, w13_input_global_scale = marlin_act_int8_process_scales(
            marlin_w13_scales
        )

    marlin_w2_scales = marlin_moe_permute_scales(
        s=w2_scales,
        size_k=layer.intermediate_size_per_partition,
        size_n=w2_scales.shape[2],
        group_size=group_size,
        is_a_8bit=is_a_8bit,
    )
    if input_dtype == torch.int8 and layer.num_groups_w2 > 1:
        marlin_w2_scales, w2_input_global_scale = marlin_act_int8_process_scales(
            marlin_w2_scales
        )

    # --- Convert zero points ---
    marlin_w13_qzeros = moe_awq_to_marlin_zero_points(
        w13_qzeros,
        size_k=w13_qzeros.shape[1],
        size_n=w13_qzeros.shape[2] * pack_factor,
        num_bits=weight_bits,
        is_a_8bit=is_a_8bit,
    )
    marlin_w2_qzeros = moe_awq_to_marlin_zero_points(
        w2_qzeros,
        size_k=w2_qzeros.shape[1],
        size_n=w2_qzeros.shape[2] * pack_factor,
        num_bits=weight_bits,
        is_a_8bit=is_a_8bit,
    )

    # --- Permute bias ---
    if w13_bias is not None:
        w13_bias_out = marlin_permute_bias(w13_bias)
    if w2_bias is not None:
        w2_bias_out = marlin_permute_bias(w2_bias)

    return (
        marlin_w13_qweight,
        marlin_w2_qweight,
        marlin_w13_scales,
        marlin_w2_scales,
        None,  # w13_g_idx
        None,  # w2_g_idx
        w13_g_idx_sort_indices,
        w2_g_idx_sort_indices,
        marlin_w13_qzeros,
        marlin_w2_qzeros,
        w13_input_global_scale,
        w2_input_global_scale,
        w13_bias_out,
        w2_bias_out,
    )

`_process_weights_cpu(quant_config, w13, w2, w13_scale, w2_scale, w13_g_idx=None, w2_g_idx=None, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None)` ¶

CPU INT4 W4A16 weight post-processing.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_weights_cpu(
    quant_config: QuantizationConfig | QuantizationArgs | None,
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    w13_g_idx: torch.Tensor | None = None,
    w2_g_idx: torch.Tensor | None = None,
    w13_qzeros: torch.Tensor | None = None,
    w2_qzeros: torch.Tensor | None = None,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor | None,  # w13_g_idx
    torch.Tensor | None,  # w2_g_idx
    torch.Tensor | None,  # w13_g_idx_sort_indices
    torch.Tensor | None,  # w2_g_idx_sort_indices
    torch.Tensor | None,  # w13_qzeros
    torch.Tensor | None,  # w2_qzeros
    torch.Tensor | None,  # w13_input_global_scale
    torch.Tensor | None,  # w2_input_global_scale
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """Prepare INT4 W4A16 MoE weights for the CPU backend.

    The packing algorithm is picked from the type of ``quant_config``, the
    zero points (if any) are exposed as int32, and everything is handed to
    ``prepare_int4_moe_layer_for_cpu``. ``g_idx`` tensors are passed through
    unchanged and biases are cast to float32.

    Raises:
        NotImplementedError: for a GPTQ config with ``desc_act`` enabled.
        TypeError: if ``quant_config`` is not an ``AutoAWQConfig``,
            ``AutoGPTQConfig`` or ``QuantizationArgs``.
    """
    from vllm.model_executor.layers.fused_moe.experts.cpu_moe import (
        prepare_int4_moe_layer_for_cpu,
    )
    from vllm.model_executor.layers.quantization.auto_awq import (
        AutoAWQConfig,
    )
    from vllm.model_executor.layers.quantization.auto_gptq import (
        AutoGPTQConfig,
    )

    def _zeros_as_int32(zeros: torch.Tensor | None) -> torch.Tensor | None:
        # Hand the repacker int32 data; anything else is reinterpreted.
        if zeros is None:
            return None
        raw = zeros.data
        return raw if zeros.dtype == torch.int32 else raw.view(torch.int32)

    # Packing format by config type:
    #   AWQ                -> qweight [E, K, 2*N//8], packed along output/N.
    #   GPTQ               -> qweight [E, K//8, 2*N], packed along input/K.
    #   compressed-tensors -> same layout as GPTQ.
    if isinstance(quant_config, AutoAWQConfig):
        packing_algo = ops.CPUQuantAlgo.AWQ
    elif isinstance(quant_config, AutoGPTQConfig):
        if quant_config.desc_act:
            raise NotImplementedError(
                "CPU WNA16 MoE backend does not support GPTQ with "
                "desc_act=True. The fused MoE kernel has no g_idx "
                "reordering support."
            )
        packing_algo = ops.CPUQuantAlgo.GPTQ
    elif isinstance(quant_config, QuantizationArgs):
        packing_algo = ops.CPUQuantAlgo.GPTQ
    else:
        raise TypeError(
            "CPU WNA16 MoE backend requires AutoAWQConfig, AutoGPTQConfig "
            f"or QuantizationArgs, got {type(quant_config).__name__}."
        )

    prepared = prepare_int4_moe_layer_for_cpu(
        w13,
        w2,
        w13_scale,
        w2_scale,
        quant_algo=packing_algo,
        w13_zeros=_zeros_as_int32(w13_qzeros),
        w2_zeros=_zeros_as_int32(w2_qzeros),
    )
    w13_out, w2_out, s13_out, s2_out, z13_out, z2_out = prepared

    w13_bias_out = None if w13_bias is None else w13_bias.to(torch.float32)
    w2_bias_out = None if w2_bias is None else w2_bias.to(torch.float32)
    return (
        w13_out,
        w2_out,
        s13_out,
        s2_out,
        w13_g_idx,
        w2_g_idx,
        None,  # w13_g_idx_sort_indices (unused on CPU)
        None,  # w2_g_idx_sort_indices (unused on CPU)
        z13_out,
        z2_out,
        None,  # w13_input_global_scale
        None,  # w2_input_global_scale
        w13_bias_out,
        w2_bias_out,
    )

`_process_weights_emulation_awq(w13, w2, w13_scale, w2_scale, w13_qzeros, w2_qzeros)` ¶

Dequantize AWQ int4 weights to BF16 for the emulation backend.

AWQ inputs

w13: [E, K, 2*N//8] int32 (packed along N, gate+up on dim 2); w2: [E, N, K//8] int32 (packed along K); w13_scale: [E, K//gs, 2*N] float16; w2_scale: [E, N//gs, K] float16

Outputs (what TritonExperts expects): w13_out: [E, 2*N, K] bfloat16 w2_out: [E, K, N] bfloat16

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_weights_emulation_awq(
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    w13_qzeros: torch.Tensor | None,
    w2_qzeros: torch.Tensor | None,
) -> tuple:
    """Expand AWQ-packed int4 expert weights for the emulation backend.

    Expected AWQ layouts:
        w13: [E, K, 2*N//8]       int32  (packed along N, gate+up on dim 2)
        w2:  [E, N, K//8]         int32  (packed along K)
        w13_scale: [E, K//gs, 2*N]  float16
        w2_scale:  [E, N//gs, K]    float16

    Produced layouts (what TritonExperts expects):
        w13_out: [E, 2*N, K]  bfloat16
        w2_out:  [E, K, N]    bfloat16

    Returns the usual 14-slot weight tuple: the two dequantized weights, a
    shared one-element float16 placeholder in both scale slots, and ``None``
    in the remaining ten slots.
    """
    # w13 keeps K unpacked in dim 1; asking the helper for a transposed
    # result yields [E, 2*N, K] directly.
    w13_dense = _unpack_and_dequant_int4_awq(
        w13, w13_scale, w13_qzeros, transpose_output=True
    )

    # w2 is [E, N, K//8]: the helper unpacks the packed column dim, giving
    # [E, N, K]; swapping the last two dims then gives [E, K, N].
    w2_dense = _unpack_and_dequant_int4_awq(
        w2, w2_scale, w2_qzeros, transpose_output=False
    )
    w2_dense = w2_dense.permute(0, 2, 1).contiguous()

    placeholder_scale = torch.ones(1, dtype=torch.float16, device=w13.device)
    populated = (w13_dense, w2_dense, placeholder_scale, placeholder_scale)
    # g_idx, sort indices, qzeros, input global scales and biases: all unset.
    return populated + (None,) * 10

`_process_weights_emulation_gptq(w13, w2, w13_scale, w2_scale, w13_qzeros, w2_qzeros)` ¶

Dequantize int4 weights to BF16 for the emulation backend.

Inputs are in GPTQ packed format

w13: [E, K//8, 2*N] int32 (gate+up proj stacked on dim 2); w2: [E, N//8, K] int32; w13_scale: [E, K//gs, 2*N] float16; w2_scale: [E, N//gs, K] float16

Outputs (what TritonExperts expects): w13_out: [E, 2*N, K] bfloat16 w2_out: [E, K, N] bfloat16

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_weights_emulation_gptq(
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    w13_qzeros: torch.Tensor | None,
    w2_qzeros: torch.Tensor | None,
) -> tuple:
    """Expand GPTQ-packed int4 expert weights for the emulation backend.

    Expected GPTQ layouts:
        w13: [E, K//8, 2*N]   int32  (gate+up proj stacked on dim 2)
        w2:  [E, N//8, K]     int32
        w13_scale: [E, K//gs, 2*N]  float16
        w2_scale:  [E, N//gs, K]    float16

    Produced layouts (what TritonExperts expects):
        w13_out: [E, 2*N, K]  bfloat16
        w2_out:  [E, K, N]    bfloat16

    Returns the usual 14-slot weight tuple: the two dequantized weights, a
    shared one-element float16 placeholder in both scale slots (unused;
    nulled out in Int4EmulationTritonExperts), and ``None`` in the
    remaining ten slots.
    """
    # w13 is packed along K (dim 1) with 2*N output columns (dim 2); a
    # transposed result is [E, 2*N, K].
    w13_dense = _unpack_and_dequant_int4_gptq(
        w13, w13_scale, w13_qzeros, transpose_output=True
    )

    # w2 is packed along N (dim 1 is N//8) with K output columns (dim 2).
    # Unpacking without transposing gives [E, N, K]; TritonExperts wants
    # [E, K, N], so swap the last two dims afterwards.
    w2_dense = _unpack_and_dequant_int4_gptq(
        w2, w2_scale, w2_qzeros, transpose_output=False
    )
    w2_dense = w2_dense.permute(0, 2, 1).contiguous()

    placeholder_scale = torch.ones(1, dtype=torch.float16, device=w13.device)
    # Slots 0-1 now hold bf16 weights rather than packed int32.
    populated = (w13_dense, w2_dense, placeholder_scale, placeholder_scale)
    # g_idx, sort indices, qzeros, input global scales and biases: all unset.
    return populated + (None,) * 10

`_process_weights_flashinfer(w13_qweight, w2_qweight, w13_scales, w2_scales, w13_g_idx, w2_g_idx, w13_bias=None, w2_bias=None)` ¶

Flashinfer (TRT-LLM MXINT4) weight post-processing.

Steps¶

Transform weights/scales via prepare_static_weights_for_trtllm_mxint4_moe.
Return transformed tensors, passing through g_idx/bias unchanged.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_weights_flashinfer(
    w13_qweight: torch.Tensor,
    w2_qweight: torch.Tensor,
    w13_scales: torch.Tensor,
    w2_scales: torch.Tensor,
    w13_g_idx: torch.Tensor,
    w2_g_idx: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor,  # w13_g_idx
    torch.Tensor,  # w2_g_idx
    torch.Tensor | None,  # w13_g_idx_sort_indices
    torch.Tensor | None,  # w2_g_idx_sort_indices
    torch.Tensor | None,  # w13_qzeros
    torch.Tensor | None,  # w2_qzeros
    torch.Tensor | None,  # w13_input_global_scale
    torch.Tensor | None,  # w2_input_global_scale
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """Post-process weights for the Flashinfer (TRT-LLM MXINT4) backend.

    Steps
    -----
    1. Run ``prepare_static_weights_for_trtllm_mxint4_moe`` on the packed
       weights and scales.
    2. Return its gemm1/gemm2 weights and scales; ``g_idx`` and bias tensors
       are handed back untouched and every other slot is ``None``.
    """
    from vllm.model_executor.layers.quantization.utils.flashinfer_mxint4_moe import (
        prepare_static_weights_for_trtllm_mxint4_moe,
    )

    # Note the argument order: each weight is followed by its own scales.
    prepared = prepare_static_weights_for_trtllm_mxint4_moe(
        w13_qweight,
        w13_scales,
        w2_qweight,
        w2_scales,
    )
    gemm1_weights = prepared["gemm1_weights"]
    gemm2_weights = prepared["gemm2_weights"]
    gemm1_scales = prepared["gemm1_scales"]
    gemm2_scales = prepared["gemm2_scales"]

    return (
        gemm1_weights,
        gemm2_weights,
        gemm1_scales,
        gemm2_scales,
        w13_g_idx,
        w2_g_idx,
        None,  # w13_g_idx_sort_indices
        None,  # w2_g_idx_sort_indices
        None,  # w13_qzeros
        None,  # w2_qzeros
        None,  # w13_input_global_scale
        None,  # w2_input_global_scale
        w13_bias,
        w2_bias,
    )

`_process_weights_marlin(layer, input_dtype, num_bits, pack_factor, group_size, actorder, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_g_idx, w2_g_idx, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None)` ¶

Standard Marlin weight post-processing shared by MARLIN and BATCHED_MARLIN backends.

Steps¶

Optional FP8 preprocessing of packed weights / scales.
Sort / reset g_idx tensors for act-order handling.
Repack weights via gptq_marlin_moe_repack.
Permute scales (and optionally extract INT8 global scales).
Permute bias tensors.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_weights_marlin(
    layer: torch.nn.Module,
    input_dtype: torch.dtype | None,
    num_bits: int,
    pack_factor: int,
    group_size: int,
    actorder: str | None,
    w13_qweight: torch.Tensor,
    w2_qweight: torch.Tensor,
    w13_scales: torch.Tensor,
    w2_scales: torch.Tensor,
    w13_g_idx: torch.Tensor,
    w2_g_idx: torch.Tensor,
    w13_qzeros: torch.Tensor | None = None,
    w2_qzeros: torch.Tensor | None = None,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor,  # w13_g_idx
    torch.Tensor,  # w2_g_idx
    torch.Tensor,  # w13_g_idx_sort_indices
    torch.Tensor,  # w2_g_idx_sort_indices
    torch.Tensor | None,  # w13_qzeros
    torch.Tensor | None,  # w2_qzeros
    torch.Tensor | None,  # w13_input_global_scale
    torch.Tensor | None,  # w2_input_global_scale
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """Standard Marlin weight post-processing shared by MARLIN and
    BATCHED_MARLIN backends.

    Steps
    -----
    1. Optional FP8 preprocessing of packed weights / scales.
    2. Pad the intermediate size when ``marlin_moe_padded_intermediate``
       returns a larger value (asserted to be incompatible with act-order).
    3. Sort / reset g_idx tensors for act-order handling.
    4. Repack weights via ``gptq_marlin_moe_repack``.
    5. Permute scales (and optionally extract INT8 global scales).
    6. Convert zero points, only when both ``w13_qzeros`` and ``w2_qzeros``
       are given.
    7. Permute bias tensors.

    Parameters
    ----------
    layer:
        Module providing ``intermediate_size_per_partition``,
        ``num_groups_w13`` and ``num_groups_w2``.
    input_dtype:
        Activation dtype, or ``None``. A one-byte dtype enables the
        ``is_a_8bit`` mode of the helpers.
    num_bits:
        Weight bit width forwarded to the repack and zero-point helpers.
    pack_factor:
        Number of quantized values per packed element.
    group_size:
        Quantization group size; ``-1`` is treated specially when deriving
        ``size_k`` for the w2 scales.
    actorder:
        ``"group"`` selects the act-order path that sorts ``g_idx``; any
        other value replaces ``g_idx`` and the sort indices with empty
        tensors.

    Returns
    -------
    A 14-tuple in the order given by the return annotation.
    """
    # Any activation dtype that is one byte wide (e.g. int8 or FP8) counts as
    # 8-bit activations.
    is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1

    marlin_w13_qweight: torch.Tensor
    marlin_w2_qweight: torch.Tensor
    marlin_w13_scales: torch.Tensor
    marlin_w2_scales: torch.Tensor
    w13_g_idx_sort_indices: torch.Tensor | None = None
    w2_g_idx_sort_indices: torch.Tensor | None = None
    w13_input_global_scale: torch.Tensor | None = None
    w2_input_global_scale: torch.Tensor | None = None
    w13_bias_out: torch.Tensor | None = None
    w2_bias_out: torch.Tensor | None = None

    # --- FP8 weight / scale adjustment ---
    if input_dtype == torch.float8_e4m3fn:
        # NOTE: for non-zp quantization format only
        # Called with inplace=False, so the results are taken from the return
        # values rather than from the input tensors.
        marlin_w13_qweight = ops.marlin_int4_fp8_preprocess(w13_qweight, inplace=False)
        marlin_w2_qweight = ops.marlin_int4_fp8_preprocess(w2_qweight, inplace=False)
        # NOTE(review): the 512 factor presumably pairs with the FP8
        # preprocessing above — confirm against the kernel.
        marlin_w13_scales = w13_scales.data * 512
        marlin_w2_scales = w2_scales.data * 512
    else:
        marlin_w13_qweight = w13_qweight
        marlin_w2_qweight = w2_qweight
        marlin_w13_scales = w13_scales
        marlin_w2_scales = w2_scales

    # --- Pad the intermediate size to a valid Marlin thread tile ---
    # GPTQ packs along K: w13's N is in the (shard) columns, w2's N in the rows.
    # Act-order keeps the strict shape and is never padded.
    N = layer.intermediate_size_per_partition
    padded_N = marlin_moe_padded_intermediate(N, group_size)
    if padded_N != N:
        # NOTE(review): this guard is an assert, so it is skipped when Python
        # runs with -O.
        assert actorder != "group", (
            "Marlin MoE thread-tile padding is unsupported with act-order"
        )
        # w13 columns are unpacked (N per shard); w2 rows are packed, hence
        # padded_N // pack_factor.
        marlin_w13_qweight = _pad_w13_shard_cols(marlin_w13_qweight, N, padded_N)
        marlin_w2_qweight = _pad_rows(marlin_w2_qweight, padded_N // pack_factor)
        marlin_w13_scales = _pad_w13_shard_cols(marlin_w13_scales, N, padded_N)
        if group_size > 0:
            # One scale row per group along w2's rows.
            marlin_w2_scales = _pad_rows(marlin_w2_scales, padded_N // group_size)
        if w13_qzeros is not None:
            w13_qzeros = _pad_w13_shard_cols(
                w13_qzeros, N // pack_factor, padded_N // pack_factor
            )
        if w2_qzeros is not None and group_size > 0:
            w2_qzeros = _pad_rows(w2_qzeros, padded_N // group_size)
        # Only the w13 bias is padded; w2_bias is left at its original size.
        if w13_bias is not None:
            w13_bias = _pad_w13_bias(w13_bias, N, padded_N)

    # --- Process act_order (g_idx) ---
    if actorder == "group":
        # For each expert, compute the permutation that sorts g_idx and store
        # both the permutation and the sorted g_idx.
        num_experts = w13_g_idx.shape[0]
        w13_g_idx_sort_indices = torch.empty_like(w13_g_idx)
        w2_g_idx_sort_indices = torch.empty_like(w2_g_idx)
        w13_sorted_g_idx = torch.empty_like(w13_g_idx)
        w2_sorted_g_idx = torch.empty_like(w2_g_idx)
        for e in range(num_experts):
            w13_g_idx_sort_indices[e] = torch.argsort(w13_g_idx[e]).to(torch.int32)
            w2_g_idx_sort_indices[e] = torch.argsort(w2_g_idx[e]).to(torch.int32)
            w13_sorted_g_idx[e] = w13_g_idx[e][w13_g_idx_sort_indices[e]]
            w2_sorted_g_idx[e] = w2_g_idx[e][w2_g_idx_sort_indices[e]]
        w13_g_idx = w13_sorted_g_idx
        w2_g_idx = w2_sorted_g_idx
    else:
        # Without act-order, g_idx and the sort indices are replaced by empty
        # (num_experts, 0) int32 parameters on the same device.
        num_experts = w13_g_idx.shape[0]
        device = w13_g_idx.device
        w13_g_idx = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        w2_g_idx = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        w13_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        w2_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )

    # --- Repack weights ---
    # Positional size arguments: unpacked K (packed rows * pack_factor), then N.
    marlin_w13_qweight = ops.gptq_marlin_moe_repack(
        marlin_w13_qweight,
        w13_g_idx_sort_indices,
        marlin_w13_qweight.shape[1] * pack_factor,
        marlin_w13_qweight.shape[2],
        num_bits,
        is_a_8bit=is_a_8bit,
    )
    marlin_w2_qweight = ops.gptq_marlin_moe_repack(
        marlin_w2_qweight,
        w2_g_idx_sort_indices,
        marlin_w2_qweight.shape[1] * pack_factor,
        marlin_w2_qweight.shape[2],
        num_bits,
        is_a_8bit=is_a_8bit,
    )

    # --- Permute scales ---
    # NOTE(review): size_k for w13 comes from the layer's unpadded
    # intermediate size, whereas the w2 call below derives it from the scale
    # tensor's shape — confirm this asymmetry is intended.
    marlin_w13_scales = marlin_moe_permute_scales(
        s=marlin_w13_scales,
        size_k=layer.intermediate_size_per_partition,
        size_n=marlin_w13_scales.shape[2],
        group_size=group_size,
        is_a_8bit=is_a_8bit,
    )
    # With group_size == -1 the row count is multiplied by pack_factor
    # instead of the group size.
    group_size_or_pack_factor = group_size if group_size != -1 else pack_factor
    marlin_w2_scales = marlin_moe_permute_scales(
        s=marlin_w2_scales,
        size_k=marlin_w2_scales.shape[1] * group_size_or_pack_factor,
        size_n=marlin_w2_scales.shape[2],
        group_size=group_size,
        is_a_8bit=is_a_8bit,
    )

    # For int8 activations with more than one group, the helper also returns
    # an input global scale alongside the processed scales.
    if input_dtype == torch.int8:
        if layer.num_groups_w13 > 1:
            marlin_w13_scales, w13_input_global_scale = marlin_act_int8_process_scales(
                marlin_w13_scales
            )
        if layer.num_groups_w2 > 1:
            marlin_w2_scales, w2_input_global_scale = marlin_act_int8_process_scales(
                marlin_w2_scales
            )

    # --- Permute zero points ---
    # Conversion happens only when both tensors are present; otherwise the
    # inputs (possibly padded above) are returned as they are.
    if w13_qzeros is not None and w2_qzeros is not None:
        w13_qzeros = moe_packed_to_marlin_zero_points(
            w13_qzeros,
            size_k=w13_qzeros.shape[1],
            size_n=w13_qzeros.shape[2] * pack_factor,
            num_bits=num_bits,
            is_a_8bit=is_a_8bit,
        )
        w2_qzeros = moe_packed_to_marlin_zero_points(
            w2_qzeros,
            size_k=w2_qzeros.shape[1],
            size_n=w2_qzeros.shape[2] * pack_factor,
            num_bits=num_bits,
            is_a_8bit=is_a_8bit,
        )

    # --- Permute bias ---
    if w13_bias is not None:
        w13_bias_out = marlin_permute_bias(w13_bias)
    if w2_bias is not None:
        w2_bias_out = marlin_permute_bias(w2_bias)

    return (
        marlin_w13_qweight,
        marlin_w2_qweight,
        marlin_w13_scales,
        marlin_w2_scales,
        w13_g_idx,
        w2_g_idx,
        w13_g_idx_sort_indices,
        w2_g_idx_sort_indices,
        w13_qzeros,
        w2_qzeros,
        w13_input_global_scale,
        w2_input_global_scale,
        w13_bias_out,
        w2_bias_out,
    )

`_process_weights_xpu(layer, quant_config, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_bias=None, w2_bias=None)` ¶

Repack GPTQ-format INT4 MoE weights into the layout vllm_xpu_kernels.fused_moe_interface.xpu_fused_moe(is_int4=True) expects:

w13: [E, 2*N, K] int4 (uint8 storage [E, 2*N, K // 2])
w13_scales: [E, 2*N, K // group_size] params_dtype
w2:  [E, K, N]   int4 (uint8 storage [E, K, N // 2])
w2_scales:  [E, K, N // group_size]   params_dtype

Input GPTQ layout from FusedMoE.weight_loader: w13: [E, K // 8, 2*N] int32 (8 nibbles per int32 along the input dim); w13_scales: [E, K // group_size, 2*N] params_dtype; w2: [E, N // 8, K] int32; w2_scales: [E, N // group_size, K] params_dtype

Transpose dim 1 ↔ dim 2 then view int32 → uint8 to recover sequential int4-packed bytes along the input dim. Each packed int32 holds 8 nibbles (n7<<28)|(n6<<24)|...|(n1<<4)|n0 in ascending K order; on a little-endian host the int32→uint8 view exposes them as bytes [n1<<4|n0, n3<<4|n2, n5<<4|n4, n7<<4|n6], i.e. two nibbles per byte with the lower nibble = lower input-K index. xpu_fused_moe(is_int4=True) expects this convention; on a big-endian host the byte order reverses and the kernel would silently miscompute, so we hard-fail.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _process_weights_xpu(
    layer: torch.nn.Module,
    quant_config: QuantizationConfig,
    w13_qweight: torch.Tensor,
    w2_qweight: torch.Tensor,
    w13_scales: torch.Tensor,
    w2_scales: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """Repack GPTQ-format INT4 MoE weights into the layout
    `vllm_xpu_kernels.fused_moe_interface.xpu_fused_moe(is_int4=True)` expects:

        w13: [E, 2*N, K] int4 (uint8 storage [E, 2*N, K // 2])
        w13_scales: [E, 2*N, K // group_size] params_dtype
        w2:  [E, K, N]   int4 (uint8 storage [E, K, N // 2])
        w2_scales:  [E, K, N // group_size]   params_dtype

    Input GPTQ layout from FusedMoE.weight_loader:
        w13: [E, K // 8, 2*N] int32 (8 nibbles per int32 along the input dim)
        w13_scales: [E, K // group_size, 2*N] params_dtype
        w2:  [E, N // 8, K] int32
        w2_scales:  [E, N // group_size, K] params_dtype

    Transpose dim 1 ↔ dim 2 then view int32 → uint8 to recover sequential
    int4-packed bytes along the input dim. Each packed int32 holds 8 nibbles
    `(n7<<28)|(n6<<24)|...|(n1<<4)|n0` in ascending K order; on a
    little-endian host the int32→uint8 view exposes them as bytes
    `[n1<<4|n0, n3<<4|n2, n5<<4|n4, n7<<4|n6]`, i.e. two nibbles per byte
    with the lower nibble = lower input-K index. xpu_fused_moe(is_int4=True)
    expects this convention; on a big-endian host the byte order reverses
    and the kernel would silently miscompute, so we hard-fail.
    """
    del layer, quant_config  # unused — kept for parity with the marlin helper

    if sys.byteorder != "little":
        raise NotImplementedError(
            "_process_weights_xpu requires a little-endian host: the GPTQ "
            "int32 → uint8 nibble repack relies on LE byte ordering."
        )

    w13_xpu = w13_qweight.transpose(1, 2).contiguous().view(torch.uint8)
    w2_xpu = w2_qweight.transpose(1, 2).contiguous().view(torch.uint8)
    w13_scales_xpu = w13_scales.transpose(1, 2).contiguous()
    w2_scales_xpu = w2_scales.transpose(1, 2).contiguous()

    return (
        w13_xpu,
        w2_xpu,
        w13_scales_xpu,
        w2_scales_xpu,
        w13_bias,
        w2_bias,
    )

`_unpack_and_dequant_int4_awq(w_int32, scale, qzeros, transpose_output, output_dtype=torch.bfloat16)` ¶

Unpack AWQ-packed int4 weights and dequantize to output_dtype.

AWQ packs along the N (column) dimension with an interleave permutation [0,2,4,6,1,3,5,7] applied before packing, so unpacking must undo that.

Parameters:

w_int32 ¶
(Tensor) –

packed weights, shape [E, K, N_packed] where N_packed = N//8 (8 nibbles per int32, packed along N with AWQ interleaving).
scale ¶
(Tensor) –

per-group scales, shape [E, K//group_size, N], float16.
qzeros ¶
(Tensor | None) –

asymmetric zero-points, shape [E, K//gs, N_packed], int32. None for symmetric (uint4b8 with implicit bias 8).
transpose_output ¶
(bool) –

if True return [E, N, K]; if False return [E, K, N].
output_dtype ¶
(dtype, default: bfloat16 ) –

target floating-point dtype (bfloat16 or float16).

Returns:

Tensor –

Dequantized weight tensor in the requested layout.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _unpack_and_dequant_int4_awq(
    w_int32: torch.Tensor,
    scale: torch.Tensor,
    qzeros: torch.Tensor | None,
    transpose_output: bool,
    output_dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    """Expand AWQ-packed int4 weights and dequantize them to `output_dtype`.

    AWQ stores eight nibbles per int32 along the N (column) axis, writing the
    columns of each group of eight in the order [0,2,4,6,1,3,5,7]. Unpacking
    therefore extracts the nibbles LSB-first and then applies the inverse of
    that permutation to restore natural column order.

    Args:
        w_int32: packed weights, shape [E, K, N_packed] with N_packed = N//8.
        scale:   per-group scales, shape [E, K//group_size, N].
        qzeros:  packed zero-points, shape [E, K//group_size, N_packed], using
                 the same column packing as the weights. Pass None for
                 symmetric quantization, where a fixed offset of 8 is removed.
        transpose_output: True to get [E, N, K], False to get [E, K, N].
        output_dtype: floating-point dtype of the result.

    Returns:
        A contiguous dequantized weight tensor in the requested layout.
    """
    num_experts, k_dim, n_packed = w_int32.shape
    n_dim = n_packed * 8
    device = w_int32.device

    # Bit offset of each of the eight nibbles inside one int32 (LSB first).
    bit_offsets = torch.arange(8, device=device, dtype=torch.int32) * 4

    # Packed slot i holds original column awq_order[i]; the argsort of a
    # permutation is its inverse, i.e. the slot to read for each column.
    awq_order = torch.tensor([0, 2, 4, 6, 1, 3, 5, 7], device=device)
    natural_order = torch.argsort(awq_order)

    def _unpack_columns(packed: torch.Tensor, rows: int) -> torch.Tensor:
        # [E, rows, N_packed] -> [E, rows, N] int16 in natural column order.
        lanes = (packed.unsqueeze(-1) >> bit_offsets) & 0xF
        grouped = lanes.reshape(num_experts, rows, n_dim).reshape(
            num_experts, rows, n_dim // 8, 8
        )
        return (
            grouped[:, :, :, natural_order]
            .reshape(num_experts, rows, n_dim)
            .to(torch.int16)
        )

    quantized = _unpack_columns(w_int32, k_dim)  # [E, K, N]

    num_groups = scale.shape[1]
    group_size = k_dim // num_groups

    if qzeros is not None:
        # One zero-point row per group, stretched to cover every K row.
        zero_points = _unpack_columns(qzeros, num_groups)  # [E, n_gs, N]
        quantized = quantized - zero_points.repeat_interleave(group_size, dim=1)
    else:
        quantized = quantized - 8

    # Stretch scales from [E, n_gs, N] to [E, K, N] before multiplying.
    full_scale = scale.repeat_interleave(group_size, dim=1).to(output_dtype)
    dequantized = quantized.to(output_dtype) * full_scale  # [E, K, N]

    if not transpose_output:
        return dequantized.contiguous()  # [E, K, N]
    return dequantized.permute(0, 2, 1).contiguous()  # [E, N, K]

`_unpack_and_dequant_int4_gptq(w_int32, scale, qzeros, transpose_output, output_dtype=torch.bfloat16)` ¶

Unpack GPTQ-packed int4 weights and dequantize to output_dtype.

Parameters:

w_int32 ¶
(Tensor) –

packed weights, shape [E, K_packed, N] where K_packed = K//8 (8 nibbles per int32, LSB-first in the K dimension).
scale ¶
(Tensor) –

per-group scales, shape [E, K//group_size, N], float16.
qzeros ¶
(Tensor | None) –

optional asymmetric zero-points, shape [E, K//gs, N//8], int32. None for symmetric (uint4b8 with implicit bias 8).
transpose_output ¶
(bool) –

if True return [E, N, K]; if False return [E, K, N].
output_dtype ¶
(dtype, default: bfloat16 ) –

target floating-point dtype (bfloat16 or float16).

Returns:

Tensor –

Dequantized weight tensor in the requested layout.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def _unpack_and_dequant_int4_gptq(
    w_int32: torch.Tensor,
    scale: torch.Tensor,
    qzeros: torch.Tensor | None,
    transpose_output: bool,
    output_dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    """Expand GPTQ-packed int4 weights and dequantize them to `output_dtype`.

    Args:
        w_int32: packed weights, shape [E, K_packed, N] with K_packed = K//8;
                 each int32 carries eight nibbles, LSB-first along K.
        scale:   per-group scales, shape [E, K//group_size, N].
        qzeros:  optional packed zero-points, shape [E, K//group_size, N//8],
                 eight nibbles per int32 along N. Pass None for symmetric
                 quantization, where a fixed offset of 8 is removed.
        transpose_output: True to get [E, N, K], False to get [E, K, N].
        output_dtype: floating-point dtype of the result.

    Returns:
        A contiguous dequantized weight tensor in the requested layout.
    """
    num_experts, k_packed, n_dim = w_int32.shape
    k_dim = k_packed * 8

    # Extract the eight nibbles of every int32: [E, K_packed, N, 8].
    nibble_shifts = torch.arange(8, device=w_int32.device, dtype=torch.int32) * 4
    lanes = (w_int32.unsqueeze(-1) >> nibble_shifts) & 0xF

    # The nibble axis indexes K rows inside one packed row, so it has to sit
    # right after K_packed ([E, K_packed, 8, N]) before the two are merged
    # into K; merging it with N instead would scramble the rows.
    quantized = (
        lanes.transpose(2, 3).reshape(num_experts, k_dim, n_dim).to(torch.int16)
    )

    num_groups = scale.shape[1]
    group_size = k_dim // num_groups

    if qzeros is not None:
        # Asymmetric: zero-points use the same eight-nibble packing, laid out
        # sequentially along N ([E, n_gs, N//8] -> [E, n_gs, N]).
        zp_shifts = torch.arange(8, device=qzeros.device, dtype=torch.int32) * 4
        zp_lanes = (qzeros.unsqueeze(-1) >> zp_shifts) & 0xF
        zero_points = zp_lanes.reshape(num_experts, num_groups, n_dim).to(
            torch.int16
        )
        # One zero-point row per group, stretched to cover every K row.
        quantized = quantized - zero_points.repeat_interleave(group_size, dim=1)
    else:
        # Symmetric uint4b8: remove the bias so values land in [-8, 7].
        quantized = quantized - 8

    # Stretch scales from [E, n_gs, N] to [E, K, N] before multiplying.
    full_scale = scale.repeat_interleave(group_size, dim=1).to(output_dtype)
    dequantized = quantized.to(output_dtype) * full_scale  # [E, K, N]

    if not transpose_output:
        return dequantized.contiguous()  # [E, K, N]
    return dequantized.permute(0, 2, 1).contiguous()  # [E, N, K]

`backend_to_kernel_cls(backend)` ¶

Return the list of candidate experts classes for the given backend; an unrecognized backend raises ValueError.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def backend_to_kernel_cls(
    backend: WNA16MoEBackend,
) -> list[type[mk.FusedMoEExperts]]:
    """Return the candidate experts classes for ``backend``.

    The result is always a list; only the Humming backend contributes more
    than one class. Experts modules for Humming, XPU, CPU and emulation are
    imported inside their branch, so they are loaded only when that backend
    is requested.

    Args:
        backend: the WNA16 MoE backend to look up.

    Returns:
        The experts classes for ``backend``, in the order listed here.

    Raises:
        ValueError: if ``backend`` matches none of the known backends.
    """
    if backend == WNA16MoEBackend.HUMMING:
        from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import (
            BatchedHummingGroupedExperts,
            HummingGroupedExperts,
            HummingIndexedExperts,
        )

        return [
            BatchedHummingGroupedExperts,
            HummingGroupedExperts,
            HummingIndexedExperts,
        ]

    if backend == WNA16MoEBackend.MARLIN:
        return [MarlinExperts]

    if backend == WNA16MoEBackend.BATCHED_MARLIN:
        return [BatchedMarlinExperts]

    if backend == WNA16MoEBackend.FLASHINFER_TRTLLM:
        return [TrtLlmMxint4ExpertsMonolithic]

    if backend == WNA16MoEBackend.TRITON:
        return [TritonWNA16Experts]

    if backend == WNA16MoEBackend.XPU:
        from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
            XPUExpertsWNA16,
        )

        return [XPUExpertsWNA16]

    if backend == WNA16MoEBackend.CPU:
        from vllm.model_executor.layers.fused_moe.experts.cpu_moe import (
            CPUExpertsInt4,
        )

        return [CPUExpertsInt4]

    if backend == WNA16MoEBackend.EMULATION:
        from vllm.model_executor.layers.fused_moe.experts.int4_emulation_moe import (
            Int4EmulationTritonExperts,
        )

        return [Int4EmulationTritonExperts]

    # Nothing matched: report the offending backend.
    raise ValueError(f"Unknown WNA16 MoE backend: {backend.value}")

`convert_to_wna16_moe_kernel_format(backend, layer, quant_config, input_dtype, w13, w2, w13_scale, w2_scale, w13_g_idx=None, w2_g_idx=None, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None)` ¶

Dispatch weight post-processing to the appropriate per-backend handler.

To add a new backend, implement a _process_weights_<name> helper and add a branch here. Backends that rewrite the layer's parameters in place (e.g. Humming) return None; the caller then skips the param scatter.

Parameters:

backend ¶
(WNA16MoEBackend) –

the selected WNA16MoEBackend.
layer ¶
(Module) –

the FusedMoE layer whose parameters are being prepared.
quant_config ¶
(QuantizationConfig | QuantizationArgs | None) –

the QuantizationConfig for this layer.
input_dtype ¶
(dtype | None) –

optional activation dtype; usually a 16-bit type.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def convert_to_wna16_moe_kernel_format(
    backend: WNA16MoEBackend,
    layer: torch.nn.Module,
    quant_config: QuantizationConfig | QuantizationArgs | None,
    input_dtype: torch.dtype | None,
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    w13_g_idx: torch.Tensor | None = None,
    w2_g_idx: torch.Tensor | None = None,
    w13_qzeros: torch.Tensor | None = None,
    w2_qzeros: torch.Tensor | None = None,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> (
    tuple[
        torch.Tensor,  # w13_qweight
        torch.Tensor,  # w2_qweight
        torch.Tensor,  # w13_scales
        torch.Tensor,  # w2_scales
        torch.Tensor | None,  # w13_g_idx
        torch.Tensor | None,  # w2_g_idx
        torch.Tensor | None,  # w13_g_idx_sort_indices
        torch.Tensor | None,  # w2_g_idx_sort_indices
        torch.Tensor | None,  # w13_qzeros
        torch.Tensor | None,  # w2_qzeros
        torch.Tensor | None,  # w13_input_global_scale
        torch.Tensor | None,  # w2_input_global_scale
        torch.Tensor | None,  # w13_bias
        torch.Tensor | None,  # w2_bias
    ]
    | None
):
    """Dispatch weight post-processing to the appropriate per-backend handler.

    To add a new backend, implement a ``_process_weights_<name>`` helper and
    add a branch here. Backends that rewrite the layer's parameters in place
    (e.g. Humming) return ``None``; the caller then skips the param scatter.

    Args:
        backend: the selected ``WNA16MoEBackend``.
        layer: the ``FusedMoE`` layer whose parameters are being prepared.
        quant_config: the ``QuantizationConfig`` for this layer. Its concrete
            type picks the sub-path inside the Humming, Marlin, emulation and
            Triton branches. Must not be ``None`` for the XPU backend.
        input_dtype: optional activation dtype, usually should be 16 bit.
        w13: packed w13 expert weights; the expected layout depends on the
            integration that produced them (see the Triton branch comment).
        w2: packed w2 expert weights, same convention as ``w13``.
        w13_scale: quantization scales for ``w13``.
        w2_scale: quantization scales for ``w2``.
        w13_g_idx: optional group-index tensor for ``w13``; must be provided
            on the non-AWQ Marlin path.
        w2_g_idx: optional group-index tensor for ``w2``; must be provided on
            the non-AWQ Marlin path.
        w13_qzeros: optional zero-points for ``w13``; must be provided on the
            AWQ Marlin path. Not forwarded by the FlashInfer and XPU branches.
        w2_qzeros: optional zero-points for ``w2``; same rules as
            ``w13_qzeros``.
        w13_bias: optional bias for ``w13``. Not forwarded by the emulation
            branch.
        w2_bias: optional bias for ``w2``. Not forwarded by the emulation
            branch.

    Returns:
        ``None`` for the Humming backend. For every other backend, the value
        produced by that backend's handler, which per the return annotation is
        a 14-tuple ordered as (w13_qweight, w2_qweight, w13_scales, w2_scales,
        w13_g_idx, w2_g_idx, w13_g_idx_sort_indices, w2_g_idx_sort_indices,
        w13_qzeros, w2_qzeros, w13_input_global_scale, w2_input_global_scale,
        w13_bias, w2_bias).

    Raises:
        TypeError: Marlin backend with a ``quant_config`` that is not an
            ``AutoAWQConfig``, ``AutoGPTQConfig`` or ``QuantizationArgs``.
        ValueError: AWQ Marlin without zero-points, non-AWQ Marlin without
            g_idx tensors, or a backend that has no branch here.
    """
    if backend == WNA16MoEBackend.HUMMING:
        from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config
        from vllm.model_executor.layers.quantization.utils.humming_utils import (
            convert_to_humming_moe_kernel_format,
        )

        if isinstance(quant_config, MoeWNA16Config):
            from vllm.utils.humming import HummingInputSchema

            # MoeWNA16 checkpoints go through the adapter schema so their
            # packed layout is converted to Humming's.
            convert_to_humming_moe_kernel_format(
                layer,
                weight_schema=_MoeWNA16HummingWeightSchema(
                    bits=quant_config.weight_bits,
                    group_size=layer.group_size,
                    has_zero_point=quant_config.has_zp,
                ),
                input_schema=HummingInputSchema(),
            )
        else:
            convert_to_humming_moe_kernel_format(
                layer, quant_config=_humming_wna16_weight_schema(quant_config)
            )
        # Nothing to hand back: see the docstring note on in-place backends.
        return None

    if backend in (
        WNA16MoEBackend.MARLIN,
        WNA16MoEBackend.BATCHED_MARLIN,
    ):
        from vllm.model_executor.layers.quantization.auto_awq import (
            AutoAWQConfig,
        )
        from vllm.model_executor.layers.quantization.auto_gptq import (
            AutoGPTQConfig,
        )

        # Normalise the quantization parameters across the three supported
        # config types. `actorder` is intentionally left unset for AWQ: it is
        # only read on the non-AWQ path below.
        if isinstance(quant_config, AutoAWQConfig):
            num_bits = quant_config.weight_bits
            pack_factor = quant_config.pack_factor
            group_size = quant_config.group_size
        elif isinstance(quant_config, AutoGPTQConfig):
            num_bits = quant_config.quant_type.size_bits
            pack_factor = quant_config.pack_factor
            group_size = quant_config.group_size
            actorder = "group" if quant_config.desc_act else None
        elif isinstance(quant_config, QuantizationArgs):
            num_bits = quant_config.num_bits
            # Weights are packed into 32-bit words.
            pack_factor = 32 // quant_config.num_bits
            group_size = quant_config.group_size
            actorder = quant_config.actorder
        else:
            raise TypeError(
                "Marlin WNA16 MoE backend requires AutoGPTQConfig, AutoAWQConfig or "
                f"QuantizationArgs, got {type(quant_config).__name__}."
            )

        if isinstance(quant_config, AutoAWQConfig):
            if w13_qzeros is None or w2_qzeros is None:
                raise ValueError("AWQ Marlin MoE requires zero-point tensors.")

            return _process_awq_weights_marlin(
                layer,
                num_bits,
                pack_factor,
                group_size,
                input_dtype,
                w13,
                w2,
                w13_scale,
                w2_scale,
                w13_qzeros,
                w2_qzeros,
                w13_bias,
                w2_bias,
            )
        else:
            if w13_g_idx is None or w2_g_idx is None:
                raise ValueError("GPTQ Marlin MoE requires g_idx tensors.")

            return _process_weights_marlin(
                layer,
                input_dtype,
                num_bits,
                pack_factor,
                group_size,
                actorder,
                w13,
                w2,
                w13_scale,
                w2_scale,
                w13_g_idx,
                w2_g_idx,
                w13_qzeros,
                w2_qzeros,
                w13_bias,
                w2_bias,
            )
    elif backend == WNA16MoEBackend.CPU:
        return _process_weights_cpu(
            quant_config,
            w13,
            w2,
            w13_scale,
            w2_scale,
            w13_g_idx,
            w2_g_idx,
            w13_qzeros,
            w2_qzeros,
            w13_bias,
            w2_bias,
        )
    elif backend == WNA16MoEBackend.FLASHINFER_TRTLLM:
        # Zero-points are not passed to the FlashInfer handler.
        return _process_weights_flashinfer(
            w13,
            w2,
            w13_scale,
            w2_scale,
            w13_g_idx,
            w2_g_idx,
            w13_bias,
            w2_bias,
        )
    elif backend == WNA16MoEBackend.XPU:
        assert quant_config is not None
        (
            w13_xpu,
            w2_xpu,
            w13_scale_xpu,
            w2_scale_xpu,
            w13_bias_out,
            w2_bias_out,
        ) = _process_weights_xpu(
            layer,
            quant_config,
            w13,
            w2,
            w13_scale,
            w2_scale,
            w13_bias,
            w2_bias,
        )
        # The XPU helper yields only six values; widen them to the common
        # 14-tuple, using one shared zero-length int32 tensor for all four
        # g_idx slots.
        empty = torch.empty((0,), dtype=torch.int32, device=w13.device)
        return (
            w13_xpu,
            w2_xpu,
            w13_scale_xpu,
            w2_scale_xpu,
            empty,  # w13_g_idx
            empty,  # w2_g_idx
            empty,  # w13_g_idx_sort_indices
            empty,  # w2_g_idx_sort_indices
            None,  # w13_qzeros — sym int4 on XPU has none; kernel does uint4b8→s4
            None,  # w2_qzeros
            None,  # w13_input_global_scale
            None,  # w2_input_global_scale
            w13_bias_out,
            w2_bias_out,
        )
    elif backend == WNA16MoEBackend.EMULATION:
        from vllm.model_executor.layers.quantization.auto_awq import AutoAWQConfig

        # AWQ gets its own handler; every other config type takes the GPTQ
        # handler. Neither receives the bias tensors.
        if isinstance(quant_config, AutoAWQConfig):
            return _process_weights_emulation_awq(
                w13,
                w2,
                w13_scale,
                w2_scale,
                w13_qzeros,
                w2_qzeros,
            )
        return _process_weights_emulation_gptq(
            w13,
            w2,
            w13_scale,
            w2_scale,
            w13_qzeros,
            w2_qzeros,
        )
    elif backend == WNA16MoEBackend.TRITON:
        # Two possible input layouts depending on the quantization source:
        #
        # MoeWNA16 (uint8):              (E, N_out, K // bit8_pack)  — N-first
        #   → just view as uint8 (no-op)
        #
        # AutoGPTQ/compressed-tensors (int32, K-first):
        #   (E, K // pack32, N_out)
        #   → transpose to N-first, then view as uint8 to get
        #     (E, N_out, K // bit8_pack)  [int32 = 4 bytes → 4 uint8s]
        #   Scales: (E, K // gs, N_out) → transpose → (E, N_out, K // gs)
        from vllm.model_executor.layers.quantization.auto_gptq import (
            AutoGPTQConfig,
        )

        if isinstance(quant_config, (AutoGPTQConfig, QuantizationArgs)):
            # These integrations build in K-first format even when the Triton
            # backend is selected. Transpose to N-first first.
            w13_uint8 = w13.transpose(1, 2).contiguous().view(torch.uint8)
            w2_uint8 = w2.transpose(1, 2).contiguous().view(torch.uint8)
            w13_scale = w13_scale.transpose(1, 2).contiguous()
            w2_scale = w2_scale.transpose(1, 2).contiguous()
        else:
            # MoeWNA16 uses N-first uint8 weights and scales.
            w13_uint8 = w13.view(torch.uint8)
            w2_uint8 = w2.view(torch.uint8)
        # g_idx, sort-index and input-global-scale slots are returned as None;
        # zero-points and biases are passed through unchanged.
        return (
            w13_uint8,
            w2_uint8,
            w13_scale,
            w2_scale,
            None,
            None,
            None,
            None,
            w13_qzeros,
            w2_qzeros,
            None,
            None,
            w13_bias,
            w2_bias,
        )
    else:
        raise ValueError(f"Unsupported wna16 MoE backend: {backend.value}")

`make_wna16_moe_quant_config(w1_scale, w2_scale, group_size, num_bits, w1_zp=None, w2_zp=None, w1_bias=None, w2_bias=None, a1_gscale=None, a2_gscale=None, gemm1_clamp_limit=None, gemm1_alpha=None, gemm1_beta=None)` ¶

Create the FusedMoEQuantConfig for 4 or 8-bit WNA16 MoE.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def make_wna16_moe_quant_config(
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    group_size: int,
    num_bits: int,
    w1_zp: torch.Tensor | None = None,
    w2_zp: torch.Tensor | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    a1_gscale: torch.Tensor | None = None,
    a2_gscale: torch.Tensor | None = None,
    gemm1_clamp_limit: float | None = None,
    gemm1_alpha: float | None = None,
    gemm1_beta: float | None = None,
) -> FusedMoEQuantConfig:
    """Create the FusedMoEQuantConfig for 4 or 8-bit WNA16 MoE.

    Args:
        w1_scale: weight scales for the first expert GEMM.
        w2_scale: weight scales for the second expert GEMM.
        group_size: quantization group size; forwarded as
            ``block_shape=[0, group_size]``.
        num_bits: weight bit width, either 4 or 8.
        w1_zp: optional zero-points for the first GEMM.
        w2_zp: optional zero-points for the second GEMM.
        w1_bias: optional bias for the first GEMM.
        w2_bias: optional bias for the second GEMM.
        a1_gscale: optional value forwarded unchanged as ``a1_gscale``.
        a2_gscale: optional value forwarded unchanged as ``a2_gscale``.
        gemm1_clamp_limit: optional value forwarded unchanged.
        gemm1_alpha: optional value forwarded unchanged.
        gemm1_beta: optional value forwarded unchanged.

    Returns:
        The config built by ``int4_w4a16_moe_quant_config`` when
        ``num_bits == 4`` or by ``int8_w8a16_moe_quant_config`` when
        ``num_bits == 8``.

    Raises:
        ValueError: if ``num_bits`` is neither 4 nor 8.
    """
    # Validate explicitly rather than with `assert`: assertions are stripped
    # under `python -O`, which would let an unsupported width fall through
    # and silently build an int8 config.
    if num_bits == 4:
        build_config = int4_w4a16_moe_quant_config
    elif num_bits == 8:
        build_config = int8_w8a16_moe_quant_config
    else:
        raise ValueError(
            f"WNA16 MoE supports only 4-bit or 8-bit weights, got num_bits={num_bits}."
        )

    # Both builders take the same keyword arguments.
    return build_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
        w2_zp=w2_zp,
        w1_bias=w1_bias,
        w2_bias=w2_bias,
        block_shape=[0, group_size],
        a1_gscale=a1_gscale,
        a2_gscale=a2_gscale,
        gemm1_clamp_limit=gemm1_clamp_limit,
        gemm1_alpha=gemm1_alpha,
        gemm1_beta=gemm1_beta,
    )

`map_wna16_backend(runner_backend)` ¶

Map user's MoEBackend to WNA16MoEBackend.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def map_wna16_backend(runner_backend: MoEBackend) -> WNA16MoEBackend:
    """Translate a user-facing ``MoEBackend`` name into a ``WNA16MoEBackend``.

    Args:
        runner_backend: the backend name requested by the user.

    Returns:
        The ``WNA16MoEBackend`` member registered for that name.

    Raises:
        ValueError: if the name has no WNA16 counterpart; the message lists
            the accepted names.
    """
    mapping = {
        "triton": WNA16MoEBackend.TRITON,
        "marlin": WNA16MoEBackend.MARLIN,
        "humming": WNA16MoEBackend.HUMMING,
        "flashinfer_trtllm": WNA16MoEBackend.FLASHINFER_TRTLLM,
        "emulation": WNA16MoEBackend.EMULATION,
    }

    resolved = mapping.get(runner_backend)
    if not resolved:
        raise ValueError(
            f"moe_backend='{runner_backend}' is not supported for WNA16 MoE. "
            f"Expected one of {list(mapping.keys())}."
        )
    return resolved

`select_wna16_moe_backend(config, weight_key, quant_config, may_have_zp, may_have_bias)` ¶

Select the WNA16 MoE backend.

Parameters:

config ¶
(FusedMoEConfig) –

the shared FusedMoEConfig for this layer.
weight_key ¶
(QuantKey) –

The QuantKey describing the weight quantization. Must have int4 or int8 type.
quant_config ¶
(QuantizationConfig | QuantizationArgs) –

Quantization structure and checkpoint format description.
may_have_zp ¶
(bool) –

Whether the integration can provide weight zero points.
may_have_bias ¶
(bool) –

Whether the integration can provide expert bias.

Returns:

tuple[WNA16MoEBackend, type[FusedMoEExperts]] –

A tuple of (WNA16MoEBackend, experts class). The function never returns None: it raises when no backend supports the configuration.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py

def select_wna16_moe_backend(
    config: FusedMoEConfig,
    weight_key: QuantKey,
    quant_config: QuantizationConfig | QuantizationArgs,
    may_have_zp: bool,
    may_have_bias: bool,
) -> tuple[WNA16MoEBackend, type[mk.FusedMoEExperts]]:
    """Pick the WNA16 MoE backend and experts class for this layer.

    If ``config.moe_backend`` names a backend explicitly, only that backend
    is considered and any incompatibility is an error. With ``"auto"``, the
    backends from ``_get_priority_backends()`` are tried in order and the
    first experts class that reports support wins.

    Args:
        config: the shared ``FusedMoEConfig`` for this layer.
        weight_key: The QuantKey describing the weight quantization.
                    Must have int4 or int8 type.
        quant_config: Quantization structure and checkpoint format description.
        may_have_zp: Whether the integration can provide weight zero points.
        may_have_bias: Whether the integration can provide expert bias.

    Returns:
        A tuple of (``WNA16MoEBackend``, experts class).

    Raises:
        ValueError: an explicitly requested backend is unknown, incompatible
            with the quantization setup, or has no experts class supporting
            the deployment configuration.
        NotImplementedError: in auto mode, no backend supports the
            deployment configuration.
    """

    if config.moe_parallel_config.use_batched_activation_format:
        activation_format = mk.FusedMoEActivationFormat.BatchedExperts
    else:
        activation_format = mk.FusedMoEActivationFormat.Standard

    # WNA16 MoE never quantizes activations, so there is no activation key.
    activation_key = None

    def _chosen_message(backend: WNA16MoEBackend):
        return f"Using '{backend.value}' WNA16 MoE backend."

    def _rejected_message(backend: WNA16MoEBackend, reason: str | None) -> str:
        if not reason:
            return (
                f"WNA16 MoE backend '{backend.value}' does not support the "
                "deployment configuration."
            )
        return (
            f"WNA16 MoE backend '{backend.value}' does not support the "
            f"deployment configuration since {reason}."
        )

    def _first_supported_or_raise(
        backend: WNA16MoEBackend,
        config: FusedMoEConfig,
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
        activation_format: mk.FusedMoEActivationFormat,
    ) -> tuple[WNA16MoEBackend, type[mk.FusedMoEExperts]]:
        # Remember the most recent rejection so the error can quote it.
        last_reason: str | None = None
        for experts_cls in backend_to_kernel_cls(backend):
            is_ok, last_reason = experts_cls.is_supported_config(
                experts_cls, config, weight_key, activation_key, activation_format
            )
            if is_ok:
                logger.info_once(_chosen_message(backend), scope="local")
                return backend, experts_cls
        raise ValueError(_rejected_message(backend, last_reason))

    # An explicit user choice is honoured or rejected, never replaced.
    runner_backend = config.moe_backend
    if runner_backend != "auto":
        requested_backend = map_wna16_backend(runner_backend)
        mismatch = _backend_incompatibility_reason(
            requested_backend, quant_config, may_have_zp, may_have_bias
        )
        if mismatch is not None:
            raise ValueError(_rejected_message(requested_backend, mismatch))
        return _first_supported_or_raise(
            requested_backend, config, weight_key, activation_key, activation_format
        )

    # Auto mode: walk the backends in priority order.
    for candidate in _get_priority_backends():
        mismatch = _backend_incompatibility_reason(
            candidate, quant_config, may_have_zp, may_have_bias
        )
        if mismatch is not None:
            logger.debug_once(_rejected_message(candidate, mismatch), scope="local")
            continue
        for experts_cls in backend_to_kernel_cls(candidate):
            is_ok, why_not = experts_cls.is_supported_config(
                experts_cls, config, weight_key, activation_key, activation_format
            )
            if is_ok:
                logger.info_once(_chosen_message(candidate), scope="local")
                return candidate, experts_cls
            logger.debug_once(_rejected_message(candidate, why_not), scope="local")

    raise NotImplementedError(
        "No WNA16 MoE backend supports the deployment configuration."
    )

vllm.model_executor.layers.fused_moe.oracle.int_wna16 ¶

_MoeWNA16HummingWeightSchema ¶

_convert_moe_wna16_humming_tensors(tensors, has_zero_point) ¶

_get_priority_backends() ¶

_humming_wna16_weight_schema(quant_config) ¶

_pad_rows(x, padded_rows) ¶

_pad_w13_bias(bias, n, padded_n) ¶

_pad_w13_shard_cols(x, unit, padded_unit) ¶

_process_awq_weights_marlin(layer, weight_bits, pack_factor, group_size, input_dtype, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_qzeros, w2_qzeros, w13_bias=None, w2_bias=None) ¶

_process_weights_cpu(quant_config, w13, w2, w13_scale, w2_scale, w13_g_idx=None, w2_g_idx=None, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None) ¶

_process_weights_emulation_awq(w13, w2, w13_scale, w2_scale, w13_qzeros, w2_qzeros) ¶

_process_weights_emulation_gptq(w13, w2, w13_scale, w2_scale, w13_qzeros, w2_qzeros) ¶

_process_weights_flashinfer(w13_qweight, w2_qweight, w13_scales, w2_scales, w13_g_idx, w2_g_idx, w13_bias=None, w2_bias=None) ¶

Steps¶

_process_weights_marlin(layer, input_dtype, num_bits, pack_factor, group_size, actorder, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_g_idx, w2_g_idx, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None) ¶

Steps¶

_process_weights_xpu(layer, quant_config, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_bias=None, w2_bias=None) ¶

_unpack_and_dequant_int4_awq(w_int32, scale, qzeros, transpose_output, output_dtype=torch.bfloat16) ¶

w_int32 ¶

scale ¶

qzeros ¶

transpose_output ¶

output_dtype ¶

_unpack_and_dequant_int4_gptq(w_int32, scale, qzeros, transpose_output, output_dtype=torch.bfloat16) ¶

w_int32 ¶

scale ¶

qzeros ¶

transpose_output ¶

output_dtype ¶

backend_to_kernel_cls(backend) ¶

convert_to_wna16_moe_kernel_format(backend, layer, quant_config, input_dtype, w13, w2, w13_scale, w2_scale, w13_g_idx=None, w2_g_idx=None, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None) ¶

backend ¶

layer ¶

quant_config ¶

input_dtype ¶

make_wna16_moe_quant_config(w1_scale, w2_scale, group_size, num_bits, w1_zp=None, w2_zp=None, w1_bias=None, w2_bias=None, a1_gscale=None, a2_gscale=None, gemm1_clamp_limit=None, gemm1_alpha=None, gemm1_beta=None) ¶

map_wna16_backend(runner_backend) ¶

select_wna16_moe_backend(config, weight_key, quant_config, may_have_zp, may_have_bias) ¶

config ¶

weight_key ¶

quant_config ¶

may_have_zp ¶

may_have_bias ¶

`vllm.model_executor.layers.fused_moe.oracle.int_wna16` ¶

`_MoeWNA16HummingWeightSchema` ¶

`_convert_moe_wna16_humming_tensors(tensors, has_zero_point)` ¶

`_get_priority_backends()` ¶

`_humming_wna16_weight_schema(quant_config)` ¶

`_pad_rows(x, padded_rows)` ¶

`_pad_w13_bias(bias, n, padded_n)` ¶

`_pad_w13_shard_cols(x, unit, padded_unit)` ¶

`_process_awq_weights_marlin(layer, weight_bits, pack_factor, group_size, input_dtype, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_qzeros, w2_qzeros, w13_bias=None, w2_bias=None)` ¶

`_process_weights_cpu(quant_config, w13, w2, w13_scale, w2_scale, w13_g_idx=None, w2_g_idx=None, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None)` ¶

`_process_weights_emulation_awq(w13, w2, w13_scale, w2_scale, w13_qzeros, w2_qzeros)` ¶

`_process_weights_emulation_gptq(w13, w2, w13_scale, w2_scale, w13_qzeros, w2_qzeros)` ¶

`_process_weights_flashinfer(w13_qweight, w2_qweight, w13_scales, w2_scales, w13_g_idx, w2_g_idx, w13_bias=None, w2_bias=None)` ¶

`_process_weights_marlin(layer, input_dtype, num_bits, pack_factor, group_size, actorder, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_g_idx, w2_g_idx, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None)` ¶

`_process_weights_xpu(layer, quant_config, w13_qweight, w2_qweight, w13_scales, w2_scales, w13_bias=None, w2_bias=None)` ¶

`_unpack_and_dequant_int4_awq(w_int32, scale, qzeros, transpose_output, output_dtype=torch.bfloat16)` ¶

`w_int32` ¶

`scale` ¶

`qzeros` ¶

`transpose_output` ¶

`output_dtype` ¶

`_unpack_and_dequant_int4_gptq(w_int32, scale, qzeros, transpose_output, output_dtype=torch.bfloat16)` ¶

`w_int32` ¶

`scale` ¶

`qzeros` ¶

`transpose_output` ¶

`output_dtype` ¶

`backend_to_kernel_cls(backend)` ¶

`convert_to_wna16_moe_kernel_format(backend, layer, quant_config, input_dtype, w13, w2, w13_scale, w2_scale, w13_g_idx=None, w2_g_idx=None, w13_qzeros=None, w2_qzeros=None, w13_bias=None, w2_bias=None)` ¶

`backend` ¶

`layer` ¶

`quant_config` ¶

`input_dtype` ¶

`make_wna16_moe_quant_config(w1_scale, w2_scale, group_size, num_bits, w1_zp=None, w2_zp=None, w1_bias=None, w2_bias=None, a1_gscale=None, a2_gscale=None, gemm1_clamp_limit=None, gemm1_alpha=None, gemm1_beta=None)` ¶

`map_wna16_backend(runner_backend)` ¶

`select_wna16_moe_backend(config, weight_key, quant_config, may_have_zp, may_have_bias)` ¶

`config` ¶

`weight_key` ¶

`quant_config` ¶

`may_have_zp` ¶

`may_have_bias` ¶