`vllm.model_executor.layers.fused_moe.oracle.int8` ¶

Functions:

convert_to_int8_moe_kernel_format –

Convert INT8 MoE weights to backend-specific kernel format.
map_int8_backend –

Map user's MoEBackend to Int8MoeBackend.
select_int8_moe_backend –

Select the primary Int8 MoE backend.

`_get_priority_backends(moe_config)` ¶

Get available backends in priority order based on platform and config.

Source code in vllm/model_executor/layers/fused_moe/oracle/int8.py

def _get_priority_backends(
    moe_config: FusedMoEConfig,
) -> list[Int8MoeBackend]:
    """
    Return the Int8 MoE backends to try, highest priority first.

    The default order is TRITON, HUMMING, CPU. When the current platform
    reports itself as CPU, the CPU backend is promoted to the head of the
    list and the relative order of the others is kept.

    Note: ``moe_config`` is part of the signature but is not consulted by
    this function body.
    """
    # A fresh list is built on every call, so callers may mutate the result.
    ordered = [
        Int8MoeBackend.TRITON,
        Int8MoeBackend.HUMMING,
        Int8MoeBackend.CPU,
    ]

    if current_platform.is_cpu():
        # Promote CPU: drop it from its current slot and re-insert at index 0.
        ordered.remove(Int8MoeBackend.CPU)
        ordered.insert(0, Int8MoeBackend.CPU)

    return ordered

`_humming_int8_weight_schema(weight, weight_scale)` ¶

Build the humming compressed-tensors int8 schema from the canonical on-device tensors; humming does the signed-int8 -> native conversion.

Source code in vllm/model_executor/layers/fused_moe/oracle/int8.py

def _humming_int8_weight_schema(
    weight: torch.Tensor, weight_scale: torch.Tensor
) -> dict[str, Any]:
    """Build the humming compressed-tensors int8 schema from the canonical
    on-device tensors; humming does the signed-int8 -> native conversion."""
    config: dict[str, Any] = {
        "quant_method": "compressed-tensors",
        "format": "int-quantized",
        "type": "int",
        "num_bits": 8,
        "symmetric": True,
        "strategy": "channel",
    }
    num_experts, num_output = weight.shape[0], weight.shape[-2]
    if weight_scale.numel() < num_experts * num_output:
        config["strategy"] = "tensor"
    return config

`convert_to_int8_moe_kernel_format(int8_backend, w13, w2, layer=None, w13_scale=None)` ¶

Convert INT8 MoE weights to backend-specific kernel format.

Source code in vllm/model_executor/layers/fused_moe/oracle/int8.py

def convert_to_int8_moe_kernel_format(
    int8_backend: Int8MoeBackend,
    w13: torch.Tensor,
    w2: torch.Tensor,
    layer: torch.nn.Module | None = None,
    w13_scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Convert INT8 MoE weights to the layout the chosen backend expects.

    Args:
        int8_backend: Backend the weights are being prepared for.
        w13: First expert weight tensor.
        w2: Second expert weight tensor.
        layer: Module owning the weights. Required (asserted) for the HUMMING
            backend, which is converted in place through the layer.
        w13_scale: Accepted for interface compatibility; not referenced by
            this function body.

    Returns:
        ``(w13, w2)`` in backend format. For HUMMING these are read back from
        ``layer.w13_weight`` / ``layer.w2_weight``; for CPU they are the pair
        produced by ``prepare_int8_moe_layer_for_cpu``; for TRITON the inputs
        are returned unchanged.

    Raises:
        ValueError: If ``int8_backend`` is not HUMMING, CPU or TRITON.
    """
    if int8_backend == Int8MoeBackend.HUMMING:
        from vllm.model_executor.layers.quantization.utils.humming_utils import (
            convert_to_humming_moe_kernel_format,
        )

        assert layer is not None
        # Humming reads canonical CT scales (w*_weight_scale) from the layer.
        # Online int8 produces per-channel (E, N) w*_scale; expose them as the
        # (E, N, 1) w*_weight_scale humming's loader expects.
        for prefix in ("w13", "w2"):
            canonical_name = f"{prefix}_weight_scale"
            if not hasattr(layer, canonical_name):
                online_name = f"{prefix}_scale"
                scale_data = getattr(layer, online_name).data
                if scale_data.dim() < 3:
                    scale_data = scale_data.unsqueeze(-1)
                # Register under the canonical name first, then drop the
                # online-quantization attribute.
                replace_parameter(layer, canonical_name, scale_data)
                delattr(layer, online_name)

        schema = _humming_int8_weight_schema(w13, layer.w13_weight_scale)
        convert_to_humming_moe_kernel_format(layer, quant_config=schema)
        return layer.w13_weight, layer.w2_weight

    if int8_backend == Int8MoeBackend.CPU:
        from vllm.model_executor.layers.fused_moe.experts.cpu_moe import (
            prepare_int8_moe_layer_for_cpu,
        )

        cpu_w13, cpu_w2 = prepare_int8_moe_layer_for_cpu(w13, w2)
        return cpu_w13, cpu_w2

    if int8_backend != Int8MoeBackend.TRITON:
        raise ValueError(f"Unsupported Int8 MoE backend: {int8_backend.value}")

    # TRITON consumes the weights as they are.
    return w13, w2

`map_int8_backend(runner_backend)` ¶

Map user's MoEBackend to Int8MoeBackend.

Source code in vllm/model_executor/layers/fused_moe/oracle/int8.py

def map_int8_backend(runner_backend: MoEBackend) -> Int8MoeBackend:
    """Translate the user-facing ``moe_backend`` value to an Int8MoeBackend.

    Only ``"triton"`` and ``"humming"`` are recognised here.

    Raises:
        ValueError: If ``runner_backend`` has no Int8 MoE counterpart.
    """
    supported = {
        "triton": Int8MoeBackend.TRITON,
        "humming": Int8MoeBackend.HUMMING,
    }
    resolved = supported.get(runner_backend)
    if not resolved:
        raise ValueError(
            f"moe_backend='{runner_backend}' is not supported for Int8 MoE. "
            f"Expected one of {list(supported)}."
        )
    return resolved

`select_int8_moe_backend(config, weight_key=kInt8StaticChannelSym, activation_key=kInt8DynamicTokenSym)` ¶

Select the primary Int8 MoE backend. Note: Shape-specific fallbacks may still occur at runtime.

Source code in vllm/model_executor/layers/fused_moe/oracle/int8.py

def select_int8_moe_backend(
    config: FusedMoEConfig,
    weight_key: QuantKey | None = kInt8StaticChannelSym,
    activation_key: QuantKey | None = kInt8DynamicTokenSym,
) -> tuple[Int8MoeBackend, type[mk.FusedMoEExperts]]:
    """
    Select the primary Int8 MoE backend.
    Note: Shape-specific fallbacks may still occur at runtime.

    If ``config.moe_backend`` is anything other than ``"auto"``, only that
    backend is considered. Otherwise the backends returned by
    ``_get_priority_backends`` are probed in order and the first kernel class
    whose ``is_supported_config`` check passes wins.

    Args:
        config: MoE configuration; supplies ``moe_backend`` and the parallel
            config used to pick the activation format.
        weight_key: Weight quantization key forwarded to each kernel class's
            support check.
        activation_key: Activation quantization key forwarded likewise.

    Returns:
        The chosen backend and the kernel class that supports the config.

    Raises:
        ValueError: An explicit backend was requested but it cannot be mapped
            to an Int8 backend, or none of its kernel classes supports the
            configuration.
        NotImplementedError: ``moe_backend`` is ``"auto"`` and no backend
            supports the configuration.
    """

    AVAILABLE_BACKENDS = _get_priority_backends(config)

    activation_format = (
        mk.FusedMoEActivationFormat.BatchedExperts
        if config.moe_parallel_config.use_batched_activation_format
        else mk.FusedMoEActivationFormat.Standard
    )

    def _make_log_backend(backend: Int8MoeBackend) -> str:
        """Build the 'selected backend' log line."""
        available_backend_strs = [b.value for b in AVAILABLE_BACKENDS]
        return (
            f"Using {backend.value} Int8 MoE backend out "
            f"of potential backends: {available_backend_strs}."
        )

    def _make_log_unsupported(backend: Int8MoeBackend, reason: str | None) -> str:
        """Build the 'backend unsupported' message, with the reason if known."""
        if reason:
            return (
                f"Int8 MoE backend {backend.value} does not support the "
                f"deployment configuration since {reason}."
            )
        return (
            f"Int8 MoE backend '{backend.value}' does not support the "
            "deployment configuration."
        )

    def _return_or_raise(
        backend: Int8MoeBackend,
    ) -> tuple[Int8MoeBackend, type[mk.FusedMoEExperts]]:
        """Return the first supporting kernel class of ``backend`` or raise."""
        # Pre-bind ``reason``: if the backend exposes no kernel classes the
        # loop body never runs, and the raise below must still produce a
        # ValueError rather than an UnboundLocalError.
        reason: str | None = None
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
        # Reports the reason given by the last kernel class that was probed.
        raise ValueError(_make_log_unsupported(backend, reason))

    # Handle explicit moe_backend from user.
    runner_backend = config.moe_backend
    if runner_backend != "auto":
        requested_backend = map_int8_backend(runner_backend)
        return _return_or_raise(requested_backend)

    # Select kernels in order of backend.
    for backend in AVAILABLE_BACKENDS:
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls,
                config,
                weight_key,
                activation_key,
                activation_format,
            )
            if supported:
                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
            # Not fatal in auto mode: record why and try the next candidate.
            logger.debug_once(_make_log_unsupported(backend, reason))

    raise NotImplementedError(
        "No Int8 MoE backend supports the deployment configuration."
    )

vllm.model_executor.layers.fused_moe.oracle.int8 ¶

_get_priority_backends(moe_config) ¶

_humming_int8_weight_schema(weight, weight_scale) ¶

convert_to_int8_moe_kernel_format(int8_backend, w13, w2, layer=None, w13_scale=None) ¶

map_int8_backend(runner_backend) ¶

select_int8_moe_backend(config, weight_key=kInt8StaticChannelSym, activation_key=kInt8DynamicTokenSym) ¶

`vllm.model_executor.layers.fused_moe.oracle.int8` ¶

`_get_priority_backends(moe_config)` ¶

`_humming_int8_weight_schema(weight, weight_scale)` ¶

`convert_to_int8_moe_kernel_format(int8_backend, w13, w2, layer=None, w13_scale=None)` ¶

`map_int8_backend(runner_backend)` ¶

`select_int8_moe_backend(config, weight_key=kInt8StaticChannelSym, activation_key=kInt8DynamicTokenSym)` ¶