`vllm.model_executor.kernels.linear.nvfp4.flashinfer` ¶

Classes:

- `FlashInferB12xNvFp4LinearKernel` – NVFP4 GEMM via FlashInfer's b12x CuTe DSL warp-level MMA kernel (SM120+).
- `FlashInferCudnnNvFp4LinearKernel` – NVFP4 GEMM via FlashInfer's cuDNN wrapper.
- `FlashInferCuteDslNvFp4LinearKernel` – NVFP4 GEMM via FlashInfer's cutedsl backend.
- `FlashInferCutlassNvFp4LinearKernel` – NVFP4 GEMM via FlashInfer's CUTLASS wrapper.
- `FlashInferTrtllmNvFp4LinearKernel` – NVFP4 GEMM via FlashInfer's TensorRT-LLM wrapper.

`FlashInferB12xNvFp4LinearKernel` ¶

Bases: NvFp4LinearKernel

NVFP4 GEMM via FlashInfer's b12x CuTe DSL warp-level MMA kernel (SM120+).

Source code in vllm/model_executor/kernels/linear/nvfp4/flashinfer.py

class FlashInferB12xNvFp4LinearKernel(NvFp4LinearKernel):
    """NVFP4 GEMM via FlashInfer's b12x CuTe DSL warp-level MMA kernel (SM120+).

    ``process_weights_after_loading`` prepares the layer's weight tensors once;
    ``apply_weights`` quantizes the activation on every call and dispatches to
    ``flashinfer_scaled_fp4_mm`` with ``backend="b12x"``.
    """

    @classmethod
    def is_supported(
        cls, compute_capability: int | None = None
    ) -> tuple[bool, str | None]:
        """Report whether this kernel is usable in the current environment.

        The decision is taken from ``current_platform`` and the installed
        FlashInfer; ``compute_capability`` is accepted but not consulted here.

        Returns:
            ``(True, None)`` when usable, otherwise ``(False, reason)``.
        """
        # The FlashInfer probe only runs when the device capability check
        # passes (short-circuit evaluation).
        usable = (
            current_platform.has_device_capability(120)
            and has_flashinfer_b12x_gemm()
        )
        if not usable:
            return (
                False,
                "FlashInfer b12x requires SM120+ and FlashInfer "
                "with Sm120BlockScaledDenseGemmKernel",
            )
        return True, None

    @classmethod
    def can_implement(cls, config: NvFp4LinearLayerConfig) -> tuple[bool, str | None]:
        """Accept every layer configuration; ``config`` is not inspected."""
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite ``layer``'s weight tensors into the form used by ``apply_weights``.

        Replaces ``layer.weight_scale`` with the result of
        ``swizzle_blockscale``, replaces ``layer.weight`` with the padded weight
        from ``pad_nvfp4_weight_for_cutlass``, and stores the reported padding
        on ``layer.weights_padding_cols``.

        Args:
            layer: Module whose ``weight`` and ``weight_scale`` are replaced
                in place with non-trainable parameters.
        """
        swizzled_scale = swizzle_blockscale(layer.weight_scale.data)
        layer.weight_scale = torch.nn.Parameter(swizzled_scale, requires_grad=False)

        weight_padded, pad_cols = pad_nvfp4_weight_for_cutlass(layer.weight.data)
        layer.weight = torch.nn.Parameter(weight_padded, requires_grad=False)
        # Remembered so the activation can be padded to match at run time.
        layer.weights_padding_cols = pad_cols

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Quantize ``x``, run the b12x FP4 GEMM, and return the linear output.

        Args:
            layer: Module prepared by ``process_weights_after_loading``.
            x: Input activation; its dtype is passed to the GEMM as the
                output dtype.
            bias: Optional bias added to the sliced GEMM result.

        Returns:
            Tensor viewed as ``x``'s leading dimensions followed by
            ``layer.output_size_per_partition``.
        """
        n_out = layer.output_size_per_partition
        result_shape = list(x.shape[:-1]) + [n_out]

        act_fp4, act_scale = scaled_fp4_quant(
            x,
            layer.input_global_scale_inv,
            is_sf_swizzled_layout=True,
            backend="b12x",
        )

        # Falls back to no padding when the layer never recorded any.
        pad_cols = getattr(layer, "weights_padding_cols", 0)
        act_fp4 = pad_nvfp4_activation_for_cutlass(act_fp4, pad_cols)

        gemm_out = flashinfer_scaled_fp4_mm(
            act_fp4,
            layer.weight,
            act_scale,
            layer.weight_scale,
            layer.alpha,
            x.dtype,
            backend="b12x",
        )
        gemm_out = slice_nvfp4_output(gemm_out, n_out)

        if bias is None:
            return gemm_out.view(*result_shape)
        return (gemm_out + bias).view(*result_shape)

`FlashInferCudnnNvFp4LinearKernel` ¶

Bases: NvFp4LinearKernel

NVFP4 GEMM via FlashInfer's cuDNN wrapper.

Source code in vllm/model_executor/kernels/linear/nvfp4/flashinfer.py

class FlashInferCudnnNvFp4LinearKernel(NvFp4LinearKernel):
    """NVFP4 GEMM via FlashInfer's cuDNN wrapper.

    ``process_weights_after_loading`` prepares the layer's weight tensors once;
    ``apply_weights`` quantizes the activation on every call and dispatches to
    ``flashinfer_scaled_fp4_mm`` with ``backend="cudnn"``.
    """

    @classmethod
    def is_supported(
        cls, compute_capability: int | None = None
    ) -> tuple[bool, str | None]:
        """Report whether FlashInfer is available.

        ``compute_capability`` is accepted but not consulted here.

        Returns:
            ``(True, None)`` when usable, otherwise ``(False, reason)``.
        """
        if not has_flashinfer():
            return False, "FlashInfer required"
        return True, None

    @classmethod
    def can_implement(cls, config: NvFp4LinearLayerConfig) -> tuple[bool, str | None]:
        """Accept every layer configuration; ``config`` is not inspected."""
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite ``layer``'s weight tensors into the form used by ``apply_weights``.

        Replaces ``layer.weight_scale`` with the result of
        ``swizzle_blockscale``, replaces ``layer.weight`` with the padded weight
        from ``pad_nvfp4_weight_for_cutlass``, and stores the reported padding
        on ``layer.weights_padding_cols``.

        Args:
            layer: Module whose ``weight`` and ``weight_scale`` are replaced
                in place with non-trainable parameters.
        """
        # cuDNN uses the same swizzled + padded layout as CUTLASS
        swizzled_scale = swizzle_blockscale(layer.weight_scale.data)
        layer.weight_scale = torch.nn.Parameter(swizzled_scale, requires_grad=False)

        weight_padded, pad_cols = pad_nvfp4_weight_for_cutlass(layer.weight.data)
        layer.weight = torch.nn.Parameter(weight_padded, requires_grad=False)
        layer.weights_padding_cols = pad_cols

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Quantize ``x``, run the cuDNN FP4 GEMM, and return the linear output.

        Args:
            layer: Module prepared by ``process_weights_after_loading``.
            x: Input activation; its dtype is passed to the GEMM as the
                output dtype.
            bias: Optional bias added to the sliced GEMM result.

        Returns:
            Tensor viewed as ``x``'s leading dimensions followed by
            ``layer.output_size_per_partition``.
        """
        n_out = layer.output_size_per_partition
        result_shape = list(x.shape[:-1]) + [n_out]

        # Falls back to no padding when the layer never recorded any.
        pad_bytes = getattr(layer, "weights_padding_cols", 0)
        # Padding is applied by the quantization call itself via ``padded_n``.
        # NOTE(review): the factor of 2 presumably converts packed bytes to
        # FP4 elements — confirm against pad_nvfp4_weight_for_cutlass.
        padded_width = x.shape[-1] + pad_bytes * 2

        act_fp4, act_scale = scaled_fp4_quant(
            x,
            layer.input_global_scale_inv,
            is_sf_swizzled_layout=True,
            backend="flashinfer-cudnn",
            padded_n=padded_width,
        )

        gemm_out = flashinfer_scaled_fp4_mm(
            act_fp4,
            layer.weight,
            act_scale,
            layer.weight_scale,
            layer.alpha,
            x.dtype,
            backend="cudnn",
        )
        gemm_out = slice_nvfp4_output(gemm_out, n_out)

        if bias is None:
            return gemm_out.view(*result_shape)
        return (gemm_out + bias).view(*result_shape)

`FlashInferCuteDslNvFp4LinearKernel` ¶

Bases: NvFp4LinearKernel

NVFP4 GEMM via FlashInfer's cutedsl backend.

Source code in vllm/model_executor/kernels/linear/nvfp4/flashinfer.py

class FlashInferCuteDslNvFp4LinearKernel(NvFp4LinearKernel):
    """NVFP4 GEMM via FlashInfer's cutedsl backend.

    ``process_weights_after_loading`` prepares the layer's weight tensors once;
    ``apply_weights`` quantizes the activation on every call and dispatches to
    ``flashinfer_scaled_fp4_mm`` with ``backend="cute-dsl"``.
    """

    @classmethod
    def is_supported(
        cls, compute_capability: int | None = None
    ) -> tuple[bool, str | None]:
        """Report whether this kernel is usable in the current environment.

        The device family is checked first, then FlashInfer availability;
        ``compute_capability`` is accepted but not consulted here.

        Returns:
            ``(True, None)`` when usable, otherwise ``(False, reason)``.
        """
        reason: str | None = None
        if not current_platform.is_device_capability_family(100):
            reason = "FlashInfer cutedsl requires sm_10x"
        elif not has_flashinfer():
            reason = "FlashInfer required"
        return reason is None, reason

    @classmethod
    def can_implement(cls, config: NvFp4LinearLayerConfig) -> tuple[bool, str | None]:
        """Accept every layer configuration; ``config`` is not inspected."""
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite ``layer``'s weight tensors into the form used by ``apply_weights``.

        Replaces ``layer.weight_scale`` with the result of
        ``swizzle_blockscale``, replaces ``layer.weight`` with the padded weight
        from ``pad_nvfp4_weight_for_cutlass``, and stores the reported padding
        on ``layer.weights_padding_cols``.

        Args:
            layer: Module whose ``weight`` and ``weight_scale`` are replaced
                in place with non-trainable parameters.
        """
        # cutedsl uses the same swizzled + padded layout as cutlass.
        swizzled_scale = swizzle_blockscale(layer.weight_scale.data)
        layer.weight_scale = torch.nn.Parameter(swizzled_scale, requires_grad=False)

        weight_padded, pad_cols = pad_nvfp4_weight_for_cutlass(layer.weight.data)
        layer.weight = torch.nn.Parameter(weight_padded, requires_grad=False)
        layer.weights_padding_cols = pad_cols

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Quantize ``x``, run the cutedsl FP4 GEMM, and return the linear output.

        Args:
            layer: Module prepared by ``process_weights_after_loading``.
            x: Input activation; its dtype is passed to the GEMM as the
                output dtype.
            bias: Optional bias added to the sliced GEMM result.

        Returns:
            Tensor viewed as ``x``'s leading dimensions followed by
            ``layer.output_size_per_partition``.
        """
        n_out = layer.output_size_per_partition
        result_shape = list(x.shape[:-1]) + [n_out]

        act_fp4, act_scale = scaled_fp4_quant(
            x,
            layer.input_global_scale_inv,
            is_sf_swizzled_layout=True,
            backend="flashinfer-cutedsl",
        )

        # Falls back to no padding when the layer never recorded any.
        pad_cols = getattr(layer, "weights_padding_cols", 0)
        act_fp4 = pad_nvfp4_activation_for_cutlass(act_fp4, pad_cols)

        gemm_out = flashinfer_scaled_fp4_mm(
            act_fp4,
            layer.weight,
            act_scale,
            layer.weight_scale,
            layer.alpha,
            x.dtype,
            backend="cute-dsl",
        )
        gemm_out = slice_nvfp4_output(gemm_out, n_out)

        if bias is None:
            return gemm_out.view(*result_shape)
        return (gemm_out + bias).view(*result_shape)

`FlashInferCutlassNvFp4LinearKernel` ¶

Bases: NvFp4LinearKernel

NVFP4 GEMM via FlashInfer's CUTLASS wrapper.

Methods:

- `input_quant_key` – This kernel supports dynamic quantization of the input. By convention, pre-quantized blockscales must use the swizzled layout.

Source code in vllm/model_executor/kernels/linear/nvfp4/flashinfer.py

class FlashInferCutlassNvFp4LinearKernel(NvFp4LinearKernel):
    """NVFP4 GEMM via FlashInfer's CUTLASS wrapper.

    ``apply_weights`` accepts either a plain tensor, which it quantizes itself,
    or an activation already quantized according to ``input_quant_key()``.
    """

    def input_quant_key(self) -> QuantKey | None:
        """This kernel supports dynamic quantization of the input. By
        convention, pre-quantized blockscales must use the swizzled layout.

        Returns:
            Always ``kNvfp4Dynamic``.
        """
        return kNvfp4Dynamic

    @classmethod
    def is_supported(
        cls, compute_capability: int | None = None
    ) -> tuple[bool, str | None]:
        """Report whether this kernel is usable in the current environment.

        Requires ``cutlass_fp4_supported()``, device capability 100 or higher
        as reported by ``current_platform``, and FlashInfer, checked in that
        order. ``compute_capability`` is accepted but not consulted here.

        Returns:
            ``(True, None)`` when usable, otherwise ``(False, reason)``.
        """
        from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
            cutlass_fp4_supported,
        )

        requirements_met = (
            cutlass_fp4_supported()
            and current_platform.has_device_capability(100)
            and has_flashinfer()
        )
        if not requirements_met:
            return False, "FlashInfer + >=sm_100 required"
        return True, None

    @classmethod
    def can_implement(cls, config: NvFp4LinearLayerConfig) -> tuple[bool, str | None]:
        """Accept every layer configuration; ``config`` is not inspected."""
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite ``layer``'s weight tensors into the form used by ``apply_weights``.

        Replaces ``layer.weight_scale`` with the result of
        ``swizzle_blockscale``, replaces ``layer.weight`` with the padded weight
        from ``pad_nvfp4_weight_for_cutlass``, and stores the reported padding
        on ``layer.weights_padding_cols``.

        Args:
            layer: Module whose ``weight`` and ``weight_scale`` are replaced
                in place with non-trainable parameters.
        """
        swizzled_scale = swizzle_blockscale(layer.weight_scale.data)
        layer.weight_scale = torch.nn.Parameter(swizzled_scale, requires_grad=False)

        weight_padded, pad_cols = pad_nvfp4_weight_for_cutlass(layer.weight.data)
        layer.weight = torch.nn.Parameter(weight_padded, requires_grad=False)
        layer.weights_padding_cols = pad_cols

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor | QuantizedActivation,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the CUTLASS FP4 GEMM on ``x`` and return the linear output.

        Args:
            layer: Module prepared by ``process_weights_after_loading``.
            x: Either a plain tensor, quantized here, or a pre-quantized
                activation recognised by ``as_quantized_activation``.
            bias: Optional bias added to the sliced GEMM result.

        Returns:
            Tensor viewed as the original input's leading dimensions followed
            by ``layer.output_size_per_partition``.
        """
        n_out = layer.output_size_per_partition
        # Falls back to no padding when the layer never recorded any.
        pad_bytes = getattr(layer, "weights_padding_cols", 0)

        prequantized = as_quantized_activation(x, self.input_quant_key())
        if prequantized is None:
            # Plain tensor: quantize here; padding is requested via ``padded_n``.
            assert isinstance(x, torch.Tensor)
            result_dtype = x.dtype
            result_shape = list(x.shape[:-1]) + [n_out]
            act_fp4, act_scale = scaled_fp4_quant(
                x,
                layer.input_global_scale_inv,
                is_sf_swizzled_layout=True,
                backend="flashinfer-cutlass",
                padded_n=x.shape[-1] + pad_bytes * 2,
            )
        else:
            # Already quantized: only the packed data needs padding, and the
            # output dtype/shape come from the pre-quantization metadata.
            act_scale = prequantized.scale
            act_fp4 = pad_nvfp4_activation_for_cutlass(prequantized.data, pad_bytes)
            result_dtype = prequantized.orig_dtype
            result_shape = list(prequantized.orig_shape[:-1]) + [n_out]

        gemm_out = flashinfer_scaled_fp4_mm(
            act_fp4,
            layer.weight,
            act_scale,
            layer.weight_scale,
            layer.alpha,
            result_dtype,
            backend="cutlass",
        )
        gemm_out = slice_nvfp4_output(gemm_out, n_out)

        if bias is None:
            return gemm_out.view(*result_shape)
        return (gemm_out + bias).view(*result_shape)

`input_quant_key()` ¶

This kernel supports dynamic quantization of the input. By convention, pre-quantized blockscales must use the swizzled layout.

Source code in vllm/model_executor/kernels/linear/nvfp4/flashinfer.py

def input_quant_key(self) -> QuantKey | None:
    """This kernel supports dynamic quantization of the input. By
    convention, pre-quantized blockscales must use the swizzled layout.

    Returns:
        Always ``kNvfp4Dynamic``.
    """
    return kNvfp4Dynamic

`FlashInferTrtllmNvFp4LinearKernel` ¶

Bases: NvFp4LinearKernel

NVFP4 GEMM via FlashInfer's TensorRT-LLM wrapper.

Source code in vllm/model_executor/kernels/linear/nvfp4/flashinfer.py

class FlashInferTrtllmNvFp4LinearKernel(NvFp4LinearKernel):
    """NVFP4 GEMM via FlashInfer's TensorRT-LLM wrapper.

    ``process_weights_after_loading`` shuffles the layer's weight tensors with
    FlashInfer's ``shuffle_matrix_a`` / ``shuffle_matrix_sf_a``; no padding is
    applied in this class. ``apply_weights`` dispatches to
    ``flashinfer_scaled_fp4_mm`` with ``backend="trtllm"``.
    """

    @classmethod
    def is_supported(
        cls, compute_capability: int | None = None
    ) -> tuple[bool, str | None]:
        """Report whether FlashInfer is available.

        ``compute_capability`` is accepted but not consulted here.

        Returns:
            ``(True, None)`` when usable, otherwise ``(False, reason)``.
        """
        if not has_flashinfer():
            return False, "FlashInfer required"
        return True, None

    @classmethod
    def can_implement(cls, config: NvFp4LinearLayerConfig) -> tuple[bool, str | None]:
        """Accept every layer configuration; ``config`` is not inspected."""
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Shuffle ``layer.weight`` and ``layer.weight_scale`` for the trtllm GEMM.

        Both tensors are reinterpreted as ``uint8`` before shuffling. The
        shuffled scale is reshaped back to the original scale shape and viewed
        as ``float8_e4m3fn``. Both are stored as non-trainable parameters.

        Args:
            layer: Module whose ``weight`` and ``weight_scale`` are replaced
                in place.
        """
        # Imported lazily so the module loads without FlashInfer installed.
        from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a

        epilogue_tile_m = 128
        raw_weight = layer.weight.data
        raw_scale = layer.weight_scale.data

        shuffled_weight = shuffle_matrix_a(raw_weight.view(torch.uint8), epilogue_tile_m)
        layer.weight = torch.nn.Parameter(shuffled_weight, requires_grad=False)

        shuffled_scale = shuffle_matrix_sf_a(
            raw_scale.view(torch.uint8), epilogue_tile_m
        )
        shuffled_scale = shuffled_scale.reshape(raw_scale.shape)
        layer.weight_scale = torch.nn.Parameter(
            shuffled_scale.view(torch.float8_e4m3fn), requires_grad=False
        )

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Quantize ``x``, run the trtllm FP4 GEMM, and return the linear output.

        Args:
            layer: Module prepared by ``process_weights_after_loading``.
            x: Input activation; its dtype is passed to the GEMM as the
                output dtype.
            bias: Optional bias added to the sliced GEMM result.

        Returns:
            Tensor viewed as ``x``'s leading dimensions followed by
            ``layer.output_size_per_partition``.
        """
        n_out = layer.output_size_per_partition
        result_shape = list(x.shape[:-1]) + [n_out]

        act_fp4, act_scale = scaled_fp4_quant(
            x,
            layer.input_global_scale_inv,
            is_sf_swizzled_layout=True,
            backend="flashinfer-trtllm",
        )

        gemm_out = flashinfer_scaled_fp4_mm(
            act_fp4,
            layer.weight,
            act_scale,
            layer.weight_scale,
            layer.alpha,
            x.dtype,
            backend="trtllm",
        )
        gemm_out = slice_nvfp4_output(gemm_out, n_out)

        if bias is None:
            return gemm_out.view(*result_shape)
        return (gemm_out + bias).view(*result_shape)

vllm.model_executor.kernels.linear.nvfp4.flashinfer ¶

FlashInferB12xNvFp4LinearKernel ¶

FlashInferCudnnNvFp4LinearKernel ¶

FlashInferCuteDslNvFp4LinearKernel ¶

FlashInferCutlassNvFp4LinearKernel ¶

input_quant_key() ¶

FlashInferTrtllmNvFp4LinearKernel ¶

`vllm.model_executor.kernels.linear.nvfp4.flashinfer` ¶

`FlashInferB12xNvFp4LinearKernel` ¶

`FlashInferCudnnNvFp4LinearKernel` ¶

`FlashInferCuteDslNvFp4LinearKernel` ¶

`FlashInferCutlassNvFp4LinearKernel` ¶

`input_quant_key()` ¶

`FlashInferTrtllmNvFp4LinearKernel` ¶