Skip to content

vllm.model_executor.kernels.linear.scaled_mm.ScaledMMLinearKernel

Classes:

ScaledMMLinearKernel

Bases: Generic[_ConfigT, _ParamsT], ABC

Methods:

  • input_quant_key

    The activation quant key this kernel can consume pre-quantized.

Source code in vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py
class ScaledMMLinearKernel(Generic[_ConfigT, _ParamsT], ABC):
    """Abstract interface implemented by scaled-mm linear kernels.

    The class is generic over a config type (``_ConfigT``) and a per-layer
    parameter type (``_ParamsT``). Concrete kernels must implement every
    abstract method below; ``input_quant_key`` is the only hook with a
    default implementation.
    """

    @classmethod
    @abstractmethod
    def is_supported(
        cls, compute_capability: int | None = None
    ) -> tuple[bool, str | None]:
        """Report whether this kernel is usable at all.

        Returns:
            A ``(supported, detail)`` pair. Only the boolean is inspected by
            ``__init__``; the second element is presumably a human-readable
            reason when unsupported (confirm against implementations).
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def can_implement(cls, c: _ConfigT) -> tuple[bool, str | None]:
        """Report whether this kernel can handle the config ``c``.

        Returns:
            A ``(ok, detail)`` pair. Only the boolean is inspected by
            ``__init__``; the second element is presumably a human-readable
            reason when the config is rejected (confirm against
            implementations).
        """
        raise NotImplementedError

    def __init__(self, c: _ConfigT, layer_param_names: Sequence[str]) -> None:
        """Validate the kernel against ``c`` and store the construction args.

        Args:
            c: Kernel config; stored as ``self.config``.
            layer_param_names: Stored as ``self.layer_param_names``.

        Raises:
            AssertionError: If ``can_implement(c)`` or ``is_supported()``
                reports ``False``. Like any ``assert``, these checks are
                skipped when Python runs with ``-O``.
        """
        # Surface the detail returned by each check instead of discarding it,
        # so a failed assertion says which check failed and why.
        implementable, impl_detail = self.can_implement(c)
        assert implementable, (
            f"{type(self).__name__} cannot implement the given config: "
            f"{impl_detail}"
        )
        supported, support_detail = self.is_supported()
        assert supported, (
            f"{type(self).__name__} is not supported: {support_detail}"
        )
        self.config = c
        self.layer_param_names = layer_param_names

    def input_quant_key(self) -> QuantKey | None:
        """The activation quant key this kernel can consume pre-quantized.

        Manual fusion uses this to decide whether to hoist activation
        quantization out of apply_weights into an upstream fused kernel.
        Return None when the kernel needs in-kernel quantization (custom
        padding or swizzling, dynamic scales, etc.). Kernels that return a
        key must consume the activation via as_quantized_activation.
        """
        return None

    @abstractmethod
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Post-load weight-processing hook for ``layer``; must be overridden."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the kernel for ``layer`` on ``x``; must be overridden.

        Args:
            layer: Module the kernel operates on.
            x: Input tensor.
            bias: Optional bias tensor; defaults to ``None``.

        Returns:
            The resulting tensor.
        """
        raise NotImplementedError

    # return a covariant type in the subclass
    @abstractmethod
    def _get_layer_params(self, layer) -> _ParamsT:
        """Return the ``_ParamsT`` value for ``layer``; must be overridden."""
        raise NotImplementedError

input_quant_key()

The activation quant key this kernel can consume pre-quantized.

Manual fusion uses this to decide whether to hoist activation quantization out of `apply_weights` into an upstream fused kernel. Return `None` when the kernel needs in-kernel quantization (custom padding or swizzling, dynamic scales, etc.). Kernels that return a key must consume the activation via `as_quantized_activation`.

Source code in vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py
def input_quant_key(self) -> QuantKey | None:
    """Report which pre-quantized activation format this kernel accepts.

    Manual fusion consults this to decide whether activation quantization
    can be hoisted out of ``apply_weights`` and into an upstream fused
    kernel.

    Returns:
        ``None`` in this base implementation, meaning quantization stays
        inside the kernel (e.g. custom padding or swizzling, dynamic
        scales). An override that returns a key must consume the
        activation via ``as_quantized_activation``.
    """
    # Base default: no pre-quantized input is advertised.
    return None