Skip to content

vllm.model_executor.layers.quantization.inc.schemes

Modules:

Classes:

  • INCScheme

    One class per quant type. Single registration point for the factory.

INCScheme

Bases: ABC

One class per quant type. Single registration point for the factory.

Each subclass defines:
  • can_handle(): when does this scheme apply?
  • get_linear_method(): required — how to quantize Linear layers
  • get_moe_method(): optional — how to quantize MoE layers
  • get_kvcache_method(): optional — how to quantize KV cache

Schemes that don't support MoE or KV cache quantization inherit the default implementations, which raise NotImplementedError.

Methods:

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py
class INCScheme(ABC):
    """Abstract base for a single INC quantization type.

    There is one subclass per quant type, and each serves as the single
    registration point for the factory.

    Subclass contract:
      - can_handle(): abstract; answers "does this scheme apply?"
      - get_linear_method(): abstract; how to quantize Linear layers
      - get_moe_method(): optional hook; how to quantize MoE layers
      - get_kvcache_method(): optional hook; how to quantize KV cache

    The optional hooks raise NotImplementedError unless overridden, so a
    scheme without MoE/KV cache support simply inherits them.
    """

    @staticmethod
    @abstractmethod
    def can_handle(layer_config: "INCLayerConfig") -> bool:
        """Report whether this scheme applies to ``layer_config``.

        Abstract: the base implementation only raises NotImplementedError.
        """
        raise NotImplementedError

    @abstractmethod
    def get_linear_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "LinearMethodBase":
        """Return the method used to quantize a Linear layer.

        Abstract: the base implementation only raises NotImplementedError.
        """
        raise NotImplementedError

    def get_moe_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "FusedMoEMethodBase | None":
        """Optional hook for schemes that support MoE layers.

        Raises:
            NotImplementedError: always, in this default implementation;
                the message names the concrete scheme class and includes
                ``layer_config``.
        """
        message = (
            f"{type(self).__name__} does not support MoE layers. "
            f"Layer config: {layer_config}"
        )
        raise NotImplementedError(message)

    # NOTE(review): the return annotation below is kept as in the original;
    # confirm that "QuantizationMethods" is the intended type for a KV cache
    # method object.
    def get_kvcache_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "QuantizationMethods":
        """Optional hook for schemes that support KV cache quantization.

        Raises:
            NotImplementedError: always, in this default implementation;
                the message names the concrete scheme class and includes
                ``layer_config``.
        """
        message = (
            f"{type(self).__name__} does not support KV cache quantization. "
            f"Layer config: {layer_config}"
        )
        raise NotImplementedError(message)

get_kvcache_method(config, layer, prefix, layer_config)

Optional. Override if this scheme supports KV cache quantization. Default raises NotImplementedError.

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py
def get_kvcache_method(
    self,
    config: "INCConfig",
    layer: "torch.nn.Module",
    prefix: str,
    layer_config: "INCLayerConfig",
) -> "QuantizationMethods":
    """Optional hook for schemes that support KV cache quantization.

    Raises:
        NotImplementedError: always, in this default implementation; the
            message names the concrete scheme class and includes
            ``layer_config``.
    """
    message = (
        f"{type(self).__name__} does not support KV cache quantization. "
        f"Layer config: {layer_config}"
    )
    raise NotImplementedError(message)

get_moe_method(config, layer, prefix, layer_config)

Optional. Override if this scheme supports MoE. Default raises NotImplementedError.

Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py
def get_moe_method(
    self,
    config: "INCConfig",
    layer: "torch.nn.Module",
    prefix: str,
    layer_config: "INCLayerConfig",
) -> "FusedMoEMethodBase | None":
    """Optional hook for schemes that support MoE layers.

    Raises:
        NotImplementedError: always, in this default implementation; the
            message names the concrete scheme class and includes
            ``layer_config``.
    """
    message = (
        f"{type(self).__name__} does not support MoE layers. "
        f"Layer config: {layer_config}"
    )
    raise NotImplementedError(message)