Bases: ABC
One class per quant type. Single registration point for the factory.
Each subclass defines:
- can_handle(): when does this scheme apply?
- get_linear_method(): required — how to quantize Linear layers
- get_moe_method(): optional — how to quantize MoE layers
- get_kvcache_method(): optional — how to quantize KV cache
Schemes that don't support MoE/KVCache inherit the default implementation, which raises NotImplementedError.
Methods:
Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py
class INCScheme(ABC):
    """Base class for INC quantization schemes, one subclass per quant type.

    This class is the single registration point for the factory. A subclass
    provides:

    - can_handle(): when does this scheme apply?
    - get_linear_method(): required, how to quantize Linear layers
    - get_moe_method(): optional, how to quantize MoE layers
    - get_kvcache_method(): optional, how to quantize KV cache

    A scheme without MoE or KV cache support simply does not override the
    corresponding hook; the inherited default raises NotImplementedError.
    """

    @staticmethod
    @abstractmethod
    def can_handle(layer_config: "INCLayerConfig") -> bool:
        """Tell whether this scheme applies to ``layer_config``.

        Abstract: every concrete scheme must override it.
        """
        raise NotImplementedError

    @abstractmethod
    def get_linear_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "LinearMethodBase":
        """Return the method used to quantize a Linear layer.

        Abstract: every concrete scheme must override it.
        """
        raise NotImplementedError

    def get_moe_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "FusedMoEMethodBase | None":
        """Optional hook for schemes that support MoE layers.

        Raises:
            NotImplementedError: always, unless a subclass overrides this
                method. The message names the concrete scheme class and
                includes ``layer_config``.
        """
        # type(self) rather than INCScheme so the message names the subclass.
        message = (
            f"{type(self).__name__} does not support MoE layers. "
            f"Layer config: {layer_config}"
        )
        raise NotImplementedError(message)

    def get_kvcache_method(
        self,
        config: "INCConfig",
        layer: "torch.nn.Module",
        prefix: str,
        layer_config: "INCLayerConfig",
    ) -> "QuantizationMethods":
        """Optional hook for schemes that support KV cache quantization.

        Raises:
            NotImplementedError: always, unless a subclass overrides this
                method. The message names the concrete scheme class and
                includes ``layer_config``.
        """
        # type(self) rather than INCScheme so the message names the subclass.
        message = (
            f"{type(self).__name__} does not support KV cache quantization. "
            f"Layer config: {layer_config}"
        )
        raise NotImplementedError(message)
|
get_kvcache_method(config, layer, prefix, layer_config)
Optional. Override if this scheme supports KV cache quantization. Default raises NotImplementedError.
Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py
def get_kvcache_method(
    self,
    config: "INCConfig",
    layer: "torch.nn.Module",
    prefix: str,
    layer_config: "INCLayerConfig",
) -> "QuantizationMethods":
    """Optional hook for schemes that support KV cache quantization.

    Raises:
        NotImplementedError: always, unless a subclass overrides this
            method. The message names the concrete scheme class and
            includes ``layer_config``.
    """
    # type(self) so the message names the concrete scheme class.
    message = (
        f"{type(self).__name__} does not support KV cache quantization. "
        f"Layer config: {layer_config}"
    )
    raise NotImplementedError(message)
|
get_moe_method(config, layer, prefix, layer_config)
Optional. Override if this scheme supports MoE. Default raises NotImplementedError.
Source code in vllm/model_executor/layers/quantization/inc/schemes/inc_scheme.py
def get_moe_method(
    self,
    config: "INCConfig",
    layer: "torch.nn.Module",
    prefix: str,
    layer_config: "INCLayerConfig",
) -> "FusedMoEMethodBase | None":
    """Optional hook for schemes that support MoE layers.

    Raises:
        NotImplementedError: always, unless a subclass overrides this
            method. The message names the concrete scheme class and
            includes ``layer_config``.
    """
    # type(self) so the message names the concrete scheme class.
    message = (
        f"{type(self).__name__} does not support MoE layers. "
        f"Layer config: {layer_config}"
    )
    raise NotImplementedError(message)
|