`vllm.model_executor.layers.quantization.inc.schemes` ¶

Modules:

inc_ark_ops –
inc_mxfp4_linear –
inc_mxfp4_moe –
inc_mxfp4_scheme –
inc_scheme –
inc_wna16_linear –

Classes:

INCMxfp4Scheme –

MXFP4 (W4A4) scheme for AutoRound checkpoints.
INCScheme –

One class per quant type. Single registration point for the factory.

`INCMxfp4Scheme` ¶

Bases: INCScheme

MXFP4 (W4A4) scheme for AutoRound checkpoints.

Dispatches to `INCMxfp4LinearMethod` for linear layers and to `INCMxfp4MoEMethod` for fused MoE layers; see those classes for the per-module weight layout and kernel-selection details.

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_mxfp4_scheme.py

class INCMxfp4Scheme(INCScheme):
    """Scheme for AutoRound checkpoints quantized as MXFP4 (W4A4).

    Linear layers are routed to :class:`INCMxfp4LinearMethod` and fused MoE
    layers to :class:`INCMxfp4MoEMethod`; the per-module weight layout and
    kernel-selection details are described on those classes.
    """

    @staticmethod
    def can_handle(layer_config: "INCLayerConfig") -> bool:
        """Return the ``is_mxfp4`` flag of ``layer_config``."""
        return layer_config.is_mxfp4

    def get_linear_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ):
        """Build the linear method for an MXFP4 layer.

        Only ``layer_config`` is consumed; ``config``, ``layer`` and
        ``prefix`` are accepted to match the scheme interface and are
        discarded.

        Returns:
            An ``INCLinearMethod`` wrapping an ``INCMxfp4LinearMethod``
            constructed from ``layer_config``.
        """
        # Imported at call time rather than at module load.
        from .inc_mxfp4_linear import INCMxfp4LinearMethod

        # Explicitly drop the arguments this scheme does not need.
        del config, layer, prefix

        mxfp4_linear = INCMxfp4LinearMethod(layer_config)
        return INCLinearMethod(mxfp4_linear)

    def get_moe_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ):
        """Build the fused-MoE method for an MXFP4 layer.

        Only ``layer.moe_config`` is consumed; ``config``, ``prefix`` and
        ``layer_config`` are accepted to match the scheme interface and are
        discarded.

        Returns:
            An ``INCMxfp4MoEMethod`` constructed from ``layer.moe_config``.
        """
        # Imported at call time rather than at module load.
        from .inc_mxfp4_moe import INCMxfp4MoEMethod

        # Explicitly drop the arguments this scheme does not need.
        del config, prefix, layer_config

        moe_config = layer.moe_config
        return INCMxfp4MoEMethod(moe_config)

`INCScheme` ¶

Bases: ABC

One class per quant type. Single registration point for the factory.

Each subclass defines:

- can_handle(): when does this scheme apply?
- get_linear_method(): required — how to quantize Linear layers
- get_moe_method(): optional — how to quantize MoE layers
- get_kvcache_method(): optional — how to quantize KV cache

Schemes that don't support MoE or KV cache quantization inherit the default implementations, which raise NotImplementedError.

Methods:

get_kvcache_method –

Optional. Override if this scheme supports KV cache quantization.
get_moe_method –

Optional. Override if this scheme supports MoE.

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py

class INCScheme(ABC):
    """Abstract base for quantization schemes: one subclass per quant type.

    Serves as the single registration point for the factory. A subclass
    provides:
      - can_handle(): whether the scheme applies to a layer config
      - get_linear_method(): required, how Linear layers are quantized
      - get_moe_method(): optional, how MoE layers are quantized
      - get_kvcache_method(): optional, how the KV cache is quantized

    The optional hooks raise ``NotImplementedError`` unless overridden, so
    a scheme without MoE/KV-cache support needs no extra code.
    """

    @staticmethod
    @abstractmethod
    def can_handle(layer_config: "INCLayerConfig") -> bool:
        """Report whether this scheme applies to ``layer_config``.

        Abstract: concrete schemes must override.
        """
        raise NotImplementedError

    @abstractmethod
    def get_linear_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "LinearMethodBase":
        """Return the method used to quantize a Linear layer.

        Abstract: concrete schemes must override.
        """
        raise NotImplementedError

    def get_moe_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "FusedMoEMethodBase | None":
        """Return the method used to quantize a MoE layer (optional hook).

        Raises:
            NotImplementedError: always, in this default implementation;
                the message names the concrete scheme class and includes
                ``layer_config``.
        """
        message = (
            f"{type(self).__name__} does not support MoE layers. "
            f"Layer config: {layer_config}"
        )
        raise NotImplementedError(message)

    def get_kvcache_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "QuantizationMethods":
        """Return the method used to quantize the KV cache (optional hook).

        Raises:
            NotImplementedError: always, in this default implementation;
                the message names the concrete scheme class and includes
                ``layer_config``.
        """
        message = (
            f"{type(self).__name__} does not support KV cache quantization. "
            f"Layer config: {layer_config}"
        )
        raise NotImplementedError(message)

`get_kvcache_method(config, layer, prefix, layer_config)` ¶

Optional. Override if this scheme supports KV cache quantization. Default raises NotImplementedError.

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py

def get_kvcache_method(
    self,
    config: "INCConfig",
    layer: "torch.nn.Module",
    prefix: str,
    layer_config: "INCLayerConfig",
) -> "QuantizationMethods":
    """Return the method used to quantize the KV cache (optional hook).

    Raises:
        NotImplementedError: always, in this default implementation; the
            message names the concrete scheme class and includes
            ``layer_config``.
    """
    message = (
        f"{type(self).__name__} does not support KV cache quantization. "
        f"Layer config: {layer_config}"
    )
    raise NotImplementedError(message)

`get_moe_method(config, layer, prefix, layer_config)` ¶

Optional. Override if this scheme supports MoE. Default raises NotImplementedError.

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py

def get_moe_method(
    self,
    config: "INCConfig",
    layer: "torch.nn.Module",
    prefix: str,
    layer_config: "INCLayerConfig",
) -> "FusedMoEMethodBase | None":
    """Return the method used to quantize a MoE layer (optional hook).

    Raises:
        NotImplementedError: always, in this default implementation; the
            message names the concrete scheme class and includes
            ``layer_config``.
    """
    message = (
        f"{type(self).__name__} does not support MoE layers. "
        f"Layer config: {layer_config}"
    )
    raise NotImplementedError(message)

vllm.model_executor.layers.quantization.inc.schemes ¶

INCMxfp4Scheme ¶

INCScheme ¶

get_kvcache_method(config, layer, prefix, layer_config) ¶

get_moe_method(config, layer, prefix, layer_config) ¶

`vllm.model_executor.layers.quantization.inc.schemes` ¶

`INCMxfp4Scheme` ¶

`INCScheme` ¶

`get_kvcache_method(config, layer, prefix, layer_config)` ¶

`get_moe_method(config, layer, prefix, layer_config)` ¶