Skip to content

`vllm.model_executor.layers.fused_moe.oracle` ¶

Modules:

base –

Abstract base class for MoE kernel oracles.
fp8 –
int8 –
int_wna16 –
mxfp4 –
mxfp8 –
nvfp4 –
unquantized –
w4a8_int8 –

Classes:

MoEKernelOracle –

Abstract base for MoE kernel-selection oracles.
UnquantizedMoEKernelOracle –

Class-based view of the unquantized MoE kernel oracle.

`MoEKernelOracle` ¶

Bases: ABC, Generic[BackendT]

Abstract base for MoE kernel-selection oracles.

Concrete oracles MUST implement: backend_enum_cls, get_priority_backends, backend_to_kernel_cls, map_backend, select_backend, make_kernel.

Concrete oracles MAY override: convert_to_kernel_format, make_quant_config. The base class provides default implementations that are appropriate for oracles which do not need them (e.g. make_quant_config raises on the unquantized oracle).

Methods:

backend_enum_cls –

Return the concrete Enum class enumerating this oracle's backends.
backend_to_kernel_cls –

Map a backend enum value to its concrete FusedMoEExperts subclasses, in selection priority order.
convert_to_kernel_format –

Shuffle weights into the layout expected by backend.
get_priority_backends –

Return platform-appropriate backends in priority order for this moe_config.
make_kernel –

Construct the FusedMoEKernel (Prepare/Finalize + Experts combinator) for the chosen backend.
make_quant_config –

Build a FusedMoEQuantConfig for this oracle.
map_backend –

Map a user-facing MoEBackend (from the runner config) to this oracle's enum.
select_backend –

Primary entry point: choose the best supported backend for the given moe_config.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

class MoEKernelOracle(ABC, Generic[BackendT]):
    """Interface that every MoE kernel-selection oracle implements.

    Required overrides (all abstract): `backend_enum_cls`,
    `get_priority_backends`, `backend_to_kernel_cls`, `map_backend`,
    `select_backend` and `make_kernel`.

    Optional overrides: `convert_to_kernel_format` and
    `make_quant_config`. Both ship with a default here:
    `convert_to_kernel_format` hands the weights back untouched and
    `make_quant_config` raises `NotImplementedError`, which is what an
    oracle without a quantization-specific config (e.g. the unquantized
    oracle) wants.
    """

    @abstractmethod
    def backend_enum_cls(self) -> type[BackendT]:
        """Return the `Enum` class that enumerates this oracle's backends.

        Examples of such classes are `UnquantizedMoeBackend` and
        `Fp8MoeBackend`.
        """

    @abstractmethod
    def get_priority_backends(self, moe_config: FusedMoEConfig) -> list[BackendT]:
        """Return the candidate backends for `moe_config`, best first.

        The list should contain only platform-appropriate backends and
        be sorted in priority order.
        """

    @abstractmethod
    def backend_to_kernel_cls(
        self, backend: BackendT
    ) -> list[type[mk.FusedMoEExperts]]:
        """Return the `FusedMoEExperts` subclasses that implement `backend`.

        The classes are listed in selection priority order.
        """

    @abstractmethod
    def map_backend(self, runner_backend: MoEBackend) -> BackendT:
        """Translate a user-facing `MoEBackend` into this oracle's enum.

        `runner_backend` is the value taken from the runner config.
        """

    @abstractmethod
    def select_backend(
        self,
        moe_config: FusedMoEConfig,
        weight_key: "QuantKey | None" = None,
        activation_key: "QuantKey | None" = None,
    ) -> tuple[BackendT, type[mk.FusedMoEExperts] | None]:
        """Primary entry point: pick the best supported backend.

        The choice is made for the given `moe_config`.

        `weight_key` / `activation_key` describe the quantization scheme
        of the weights and activations. Quantized oracles (fp8, nvfp4,
        int8, ...) use them to disambiguate backends. The unquantized
        oracle has no use for them: `UnquantizedMoEKernelOracle` asserts
        that both are `None`.

        Subclasses that need further selection inputs (e.g. int_wna16
        needs `weight_bits`, fp8 needs `allow_vllm_cutlass`) widen the
        signature in their override; a per-oracle config object is the
        longer-term target tracked in the #37753 follow-up PRs.
        """

    @abstractmethod
    def make_kernel(
        self,
        quant_config: FusedMoEQuantConfig,
        moe_config: FusedMoEConfig,
        backend: BackendT,
        experts_cls: type[mk.FusedMoEExperts],
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> mk.FusedMoEKernel:
        """Build the `FusedMoEKernel` for the chosen backend.

        The kernel is the Prepare/Finalize + Experts combinator.
        """

    def convert_to_kernel_format(
        self,
        backend: BackendT,
        moe_config: FusedMoEConfig,
        w13_weight: torch.Tensor,
        w2_weight: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Put the expert weights into the layout that `backend` expects.

        The base implementation is a pass-through: both tensors are
        returned exactly as they were given. Oracles whose backends need
        a weight permutation should override it (for instance,
        `UnquantizedMoEKernelOracle` handles AITER and FlashInfer
        layouts).

        `moe_config` supplies MoE-layer state (e.g. `is_act_and_mul`)
        so the conversion does not have to hold a `Module` reference.
        Quantized oracles that also need scales / zero-points / block
        shapes will override with a wider signature (and ultimately a
        per-oracle config object — tracked in the #37753 follow-up PRs).
        """
        # Nothing to permute by default; hand both tensors back as-is.
        passthrough = (w13_weight, w2_weight)
        return passthrough

    def make_quant_config(self, *args, **kwargs) -> FusedMoEQuantConfig:
        """Create the `FusedMoEQuantConfig` belonging to this oracle.

        Quantized oracles (fp8, nvfp4, mxfp4, ...) override this method
        with whatever signature their quantization scheme requires.
        Unquantized oracles keep this default, which always raises
        `NotImplementedError` because they have no quantization-specific
        config to build.
        """
        # Name the concrete subclass so the error points at the oracle
        # that is missing the override.
        oracle_name = type(self).__name__
        message = (
            f"{oracle_name} does not implement make_quant_config; "
            "this oracle has no quantization-specific config to build."
        )
        raise NotImplementedError(message)

`backend_enum_cls()` `abstractmethod` ¶

Return the concrete Enum class enumerating this oracle's backends (e.g. UnquantizedMoeBackend, Fp8MoeBackend).

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

@abstractmethod
def backend_enum_cls(self) -> type[BackendT]:
    """Return the `Enum` class that enumerates this oracle's backends.

    The return value is the enum class itself, not one of its members.
    Examples of such classes are `UnquantizedMoeBackend` and
    `Fp8MoeBackend`.
    """

`backend_to_kernel_cls(backend)` `abstractmethod` ¶

Map a backend enum value to its concrete FusedMoEExperts subclasses, in selection priority order.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

@abstractmethod
def backend_to_kernel_cls(
    self, backend: BackendT
) -> list[type[mk.FusedMoEExperts]]:
    """Return the `FusedMoEExperts` subclasses that implement `backend`.

    `backend` is a member of this oracle's backend enum. The returned
    classes are listed in selection priority order.
    """

`convert_to_kernel_format(backend, moe_config, w13_weight, w2_weight)` ¶

Shuffle weights into the layout expected by backend.

Default implementation returns the inputs unchanged. Oracles whose backends need weight permutation should override this (e.g. UnquantizedMoEKernelOracle handles AITER and FlashInfer layouts).

moe_config carries MoE-layer state (e.g. is_act_and_mul) that the conversion needs without coupling the oracle to a Module reference. Quantized oracles whose conversion additionally needs scales / zero-points / block shapes will override with a wider signature (and ultimately a per-oracle config object — tracked in the #37753 follow-up PRs).

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

def convert_to_kernel_format(
    self,
    backend: BackendT,
    moe_config: FusedMoEConfig,
    w13_weight: torch.Tensor,
    w2_weight: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Put the expert weights into the layout that `backend` expects.

    The base implementation is a pass-through: both tensors are
    returned exactly as they were given. Oracles whose backends need a
    weight permutation should override it (for instance,
    `UnquantizedMoEKernelOracle` handles AITER and FlashInfer layouts).

    `moe_config` supplies MoE-layer state (e.g. `is_act_and_mul`) so
    the conversion does not have to hold a `Module` reference.
    Quantized oracles that also need scales / zero-points / block
    shapes will override with a wider signature (and ultimately a
    per-oracle config object — tracked in the #37753 follow-up PRs).
    """
    # Nothing to permute by default; hand both tensors back as-is.
    passthrough = (w13_weight, w2_weight)
    return passthrough

`get_priority_backends(moe_config)` `abstractmethod` ¶

Return platform-appropriate backends in priority order for this moe_config.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

@abstractmethod
def get_priority_backends(self, moe_config: FusedMoEConfig) -> list[BackendT]:
    """Return the candidate backends for `moe_config`, best first.

    The list should contain only platform-appropriate backends and be
    sorted in priority order.
    """

`make_kernel(quant_config, moe_config, backend, experts_cls, routing_tables=None)` `abstractmethod` ¶

Construct the FusedMoEKernel (Prepare/Finalize + Experts combinator) for the chosen backend.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

@abstractmethod
def make_kernel(
    self,
    quant_config: FusedMoEQuantConfig,
    moe_config: FusedMoEConfig,
    backend: BackendT,
    experts_cls: type[mk.FusedMoEExperts],
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEKernel:
    """Build the `FusedMoEKernel` for the chosen backend.

    The kernel is the Prepare/Finalize + Experts combinator.

    `backend` and `experts_cls` are the selected backend and the
    `FusedMoEExperts` class to use for it. `routing_tables` is an
    optional triple of tensors that defaults to `None`; what the three
    tensors hold is not specified by this interface.
    """

`make_quant_config(*args, **kwargs)` ¶

Build a FusedMoEQuantConfig for this oracle.

Quantized oracles (fp8, nvfp4, mxfp4, ...) override this with the appropriate signature for their quantization scheme. Unquantized oracles inherit the default, which raises because there is no quantization-specific config to build.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

def make_quant_config(self, *args, **kwargs) -> FusedMoEQuantConfig:
    """Create the `FusedMoEQuantConfig` belonging to this oracle.

    Quantized oracles (fp8, nvfp4, mxfp4, ...) override this method
    with whatever signature their quantization scheme requires.
    Unquantized oracles keep this default, which always raises
    `NotImplementedError` because they have no quantization-specific
    config to build.
    """
    # Name the concrete subclass so the error points at the oracle that
    # is missing the override.
    oracle_name = type(self).__name__
    message = (
        f"{oracle_name} does not implement make_quant_config; "
        "this oracle has no quantization-specific config to build."
    )
    raise NotImplementedError(message)

`map_backend(runner_backend)` `abstractmethod` ¶

Map a user-facing MoEBackend (from the runner config) to this oracle's enum.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

@abstractmethod
def map_backend(self, runner_backend: MoEBackend) -> BackendT:
    """Translate a user-facing `MoEBackend` into this oracle's enum.

    `runner_backend` is the value taken from the runner config; the
    result is the corresponding member of this oracle's backend enum.
    """

`select_backend(moe_config, weight_key=None, activation_key=None)` `abstractmethod` ¶

Primary entry point: choose the best supported backend for the given moe_config.

weight_key / activation_key carry the quantization scheme of the weights and activations and are consumed by quantized oracles (fp8, nvfp4, int8, ...) to disambiguate backends. The unquantized oracle does not use them; its override asserts that both are None. Subclasses with additional selection inputs (e.g. int_wna16 needs weight_bits, fp8 needs allow_vllm_cutlass) widen the signature in their override; a per-oracle config object is the longer-term target tracked in the #37753 follow-up PRs.

Source code in vllm/model_executor/layers/fused_moe/oracle/base.py

@abstractmethod
def select_backend(
    self,
    moe_config: FusedMoEConfig,
    weight_key: "QuantKey | None" = None,
    activation_key: "QuantKey | None" = None,
) -> tuple[BackendT, type[mk.FusedMoEExperts] | None]:
    """Primary entry point: pick the best supported backend.

    The choice is made for the given `moe_config`. The result pairs the
    chosen backend with a `FusedMoEExperts` class; the second element
    may be `None` (this interface does not say when).

    `weight_key` / `activation_key` carry the quantization scheme of
    the weights and activations and are consumed by quantized oracles
    (fp8, nvfp4, int8, ...) to disambiguate backends. The unquantized
    oracle has no use for them: `UnquantizedMoEKernelOracle` asserts
    that both are `None`.

    Subclasses with additional selection inputs (e.g. int_wna16 needs
    `weight_bits`, fp8 needs `allow_vllm_cutlass`) widen the signature
    in their override; a per-oracle config object is the longer-term
    target tracked in the #37753 follow-up PRs.
    """

`UnquantizedMoEKernelOracle` ¶

Bases: MoEKernelOracle[UnquantizedMoeBackend]

Class-based view of the unquantized MoE kernel oracle.

Each method delegates to its module-level counterpart so that instantiating and calling this class is bit-identical to calling the standalone functions. Follow-up PRs may move logic from the module-level functions into these methods.

Source code in vllm/model_executor/layers/fused_moe/oracle/unquantized.py

class UnquantizedMoEKernelOracle(MoEKernelOracle[UnquantizedMoeBackend]):
    """Object-oriented facade over the unquantized MoE oracle functions.

    Every method forwards to the matching module-level function, so that
    going through an instance is bit-identical to calling those
    functions directly. Follow-up PRs may move the logic out of the
    module-level functions and into these methods.
    """

    def backend_enum_cls(self) -> type[UnquantizedMoeBackend]:
        """Return `UnquantizedMoeBackend`, this oracle's backend enum."""
        enum_cls = UnquantizedMoeBackend
        return enum_cls

    def get_priority_backends(
        self, moe_config: FusedMoEConfig
    ) -> list[UnquantizedMoeBackend]:
        """Forward to the module-level `_get_priority_backends`."""
        ordered_backends = _get_priority_backends(moe_config)
        return ordered_backends

    def backend_to_kernel_cls(
        self, backend: UnquantizedMoeBackend
    ) -> list[type[mk.FusedMoEExperts]]:
        """Forward to the module-level `backend_to_kernel_cls` function."""
        # Inside the method body this name resolves to the module-level
        # function, not to this method, so there is no recursion.
        experts_classes = backend_to_kernel_cls(backend)
        return experts_classes

    def map_backend(self, runner_backend: MoEBackend) -> UnquantizedMoeBackend:
        """Forward to the module-level `map_unquantized_backend`."""
        mapped_backend = map_unquantized_backend(runner_backend)
        return mapped_backend

    def select_backend(
        self,
        moe_config: FusedMoEConfig,
        weight_key: "QuantKey | None" = None,
        activation_key: "QuantKey | None" = None,
    ) -> tuple[UnquantizedMoeBackend, type[mk.FusedMoEExperts] | None]:
        """Forward to `select_unquantized_moe_backend(moe_config)`.

        `weight_key` and `activation_key` exist only to match the base
        class signature; both are asserted to be `None` and neither is
        passed on.
        """
        # Negated "any key supplied" test; equivalent to requiring that
        # both keys are None.
        quant_key_given = weight_key is not None or activation_key is not None
        assert not quant_key_given, (
            "Weights and activations will never be quantized for "
            "UnquantizedMoEKernelOracle"
        )
        selection = select_unquantized_moe_backend(moe_config)
        return selection

    def convert_to_kernel_format(
        self,
        backend: UnquantizedMoeBackend,
        moe_config: FusedMoEConfig,
        w13_weight: torch.Tensor,
        w2_weight: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Forward to `convert_to_unquantized_kernel_format`.

        All four arguments are passed positionally, in the order they
        are received.
        """
        converted = convert_to_unquantized_kernel_format(
            backend, moe_config, w13_weight, w2_weight
        )
        return converted

    def make_kernel(
        self,
        quant_config: FusedMoEQuantConfig,
        moe_config: FusedMoEConfig,
        backend: UnquantizedMoeBackend,
        experts_cls: type[mk.FusedMoEExperts],
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> mk.FusedMoEKernel:
        """Forward to the module-level `make_unquantized_moe_kernel`.

        All five arguments are passed positionally, in the order they
        are received.
        """
        kernel = make_unquantized_moe_kernel(
            quant_config, moe_config, backend, experts_cls, routing_tables
        )
        return kernel