class INCConfig(QuantizationConfig):
"""Config class for Intel Neural Compressor (INC).
Repo: https://github.com/intel/neural-compressor
"""
SUPPORTED_BITS = {2, 3, 4, 8}
SUPPORTED_DTYPES = {"int"}
SUPPORTED_FORMATS = {"auto_round:auto_gptq", "auto_round:auto_awq"}
SUPPORTED_BACKENDS = {
"auto",
"gptq",
"gptq:marlin",
"awq",
"awq:marlin",
"marlin",
}
def __init__(
self,
weight_bits: int,
group_size: int,
sym: bool = True,
packing_format: str = "auto_round:auto_gptq",
block_name_to_quantize: str | list[str] | None = None,
extra_config: dict[str, Any] | None = None,
data_type: str = "int",
backend: str = "auto",
) -> None:
super().__init__()
if weight_bits not in self.SUPPORTED_BITS:
raise ValueError(
f"Unsupported weight_bits: {weight_bits}, "
f"currently only support {self.SUPPORTED_BITS}."
)
if data_type not in self.SUPPORTED_DTYPES:
raise ValueError(
f"Unsupported data_type: {data_type},"
f" currently only support {self.SUPPORTED_DTYPES}."
)
if packing_format not in self.SUPPORTED_FORMATS:
raise ValueError(
f"Unsupported packing_format: {packing_format}, "
f"currently only support {self.SUPPORTED_FORMATS}."
)
if backend not in self.SUPPORTED_BACKENDS:
raise ValueError(
f"Unsupported backend: {backend}, "
f"currently only support {self.SUPPORTED_BACKENDS}."
)
self.weight_bits = weight_bits
self.group_size = group_size
self.sym = sym
self.packing_format = packing_format
self.block_name_to_quantize = (
block_name_to_quantize.split(",")
if isinstance(block_name_to_quantize, str)
else block_name_to_quantize
)
self.extra_config = extra_config
self.data_type = data_type
self.backend = backend
self.pack_factor = Fraction(32, weight_bits)
self.config_parser = INCConfigParser(self)
def __repr__(self) -> str:
return (
f"INCConfig(weight_bits={self.weight_bits}, "
f"group_size={self.group_size}, sym={self.sym})"
)
@classmethod
def get_name(cls) -> QuantizationMethods:
return "inc"
@classmethod
def get_supported_act_dtypes(cls) -> list[torch.dtype]:
return [torch.half, torch.bfloat16]
@classmethod
def get_min_capability(cls) -> int:
return 60
@classmethod
def get_config_filenames(cls) -> list[str]:
return ["quantization_config.json"]
@classmethod
def from_config(cls, config: dict[str, Any]) -> "INCConfig":
return cls(
weight_bits=cls.get_from_keys(config, ["bits"]),
group_size=cls.get_from_keys(config, ["group_size"]),
sym=cls.get_from_keys(config, ["sym"]),
packing_format=cls.get_from_keys_or(
config, ["packing_format"], "auto_round:auto_gptq"
),
block_name_to_quantize=cls.get_from_keys_or(
config, ["block_name_to_quantize", "to_quant_block_names"], None
),
extra_config=cls.get_from_keys_or(config, ["extra_config"], None),
data_type=cls.get_from_keys_or(config, ["data_type"], "int"),
backend=cls.get_from_keys_or(config, ["backend", "vllm_backend"], "auto"),
)
def get_layer_config(self, layer, layer_name: str):
return self.config_parser.get_layer_config(layer, layer_name)
def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
if self.block_name_to_quantize is not None:
self.block_name_to_quantize = hf_to_vllm_mapper.apply_list(
self.block_name_to_quantize
)
if self.extra_config is not None:
self.extra_config = hf_to_vllm_mapper.apply_dict(self.extra_config)
def get_quant_method(self, layer: torch.nn.Module, prefix: str):
from .schemes.factory import resolve_scheme
# Match original: check model.-prefixed names for unquantized layers
if prefix and self.extra_config:
for layer_name in self.extra_config:
if (
layer_name == prefix or layer_name == f"model.{prefix}"
) and self.extra_config[layer_name].get("bits", 16) >= 16:
if isinstance(layer, RoutedExperts):
return UnquantizedFusedMoEMethod(layer.moe_config)
return UnquantizedLinearMethod()
layer_config = self.config_parser.resolve(layer, prefix)
if not layer_config.quantized:
if isinstance(layer, (LinearBase, ParallelLMHead)):
return UnquantizedLinearMethod()
if isinstance(layer, RoutedExperts):
return UnquantizedFusedMoEMethod(layer.moe_config)
return None
logger.debug(
"[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s",
prefix,
layer.__class__.__name__,
layer_config.bits,
layer_config.group_size,
layer_config.sym,
)
scheme = resolve_scheme(layer_config)
if isinstance(layer, (LinearBase, ParallelLMHead)):
return scheme.get_linear_method(self, layer, prefix, layer_config)
if isinstance(layer, RoutedExperts):
return scheme.get_moe_method(self, layer, prefix, layer_config)
return None
@classmethod
def override_quantization_method(
cls, hf_quant_cfg, user_quant, hf_config=None
) -> "QuantizationMethods | None":
"""Override the `auto-round` method to `inc`."""
is_auto_round_format = hf_quant_cfg.get("quant_method", None) == "auto-round"
if is_auto_round_format:
return cls.get_name()
return None