`vllm.v1.attention.backends.mla.prefill` ¶

Modules:

aiter_flash_attn –

AITER FlashAttention backend for MLA prefill (ROCm).
base –

Abstract base class for MLA prefill backends.
flash_attn –

FlashAttention backend for MLA prefill.
flashinfer –

FlashInfer backend for MLA prefill.
registry –

Registry for MLA prefill backends.
selector –

Selector for MLA prefill backends.
tokenspeed_mla –

TokenSpeed CuTe DSL backend for MLA prefill.
trtllm_ragged –

TRT-LLM Ragged backend for MLA prefill.

Classes:

MLAPrefillBackend –

Abstract base class for MLA prefill backends.
MLAPrefillBackendEnum –

Enumeration of all supported MLA prefill backends.

Functions:

get_mla_prefill_backend –

Select the MLA prefill backend based on configuration and device.
register_mla_prefill_backend –

Register or override an MLA prefill backend implementation.

`MLAPrefillBackend` ¶

Bases: ABC

Abstract base class for MLA prefill backends.

Methods:

prepare_metadata –

Prepare backend-specific metadata before the forward pass.
supports_quant_output –

Whether run_prefill_new_tokens can write quantized output directly (fused) for the given quant key.

Source code in vllm/v1/attention/backends/mla/prefill/base.py

class MLAPrefillBackend(ABC):
    """Common interface implemented by every MLA prefill backend.

    The class-level hooks (``supports_*``, ``is_available`` and
    ``validate_configuration``) can be queried without creating an
    instance. Instances store the attention dimensions passed to
    ``__init__`` and expose the two abstract ``run_prefill_*`` entry points.
    """

    # Dtypes for which ``supports_dtype`` answers True.
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16,
        torch.bfloat16,
    ]
    # An empty list is treated as "no restriction" by
    # ``supports_mla_dimensions``.
    supported_mla_dimensions: ClassVar[list[MLADimensions]] = []

    @staticmethod
    @abstractmethod
    def get_name() -> str:
        """Return the backend's name; concrete backends must override."""
        raise NotImplementedError

    @classmethod
    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
        """Tell whether the device capability is usable (base: always True)."""
        return True

    @classmethod
    def supports_dtype(cls, dtype: torch.dtype) -> bool:
        """Tell whether ``dtype`` is listed in ``supported_dtypes``."""
        allowed_dtypes = cls.supported_dtypes
        return dtype in allowed_dtypes

    @classmethod
    def supports_mla_dimensions(cls, mla_dimensions: MLADimensions) -> bool:
        """Tell whether ``mla_dimensions`` is accepted by this backend.

        A backend that declares no ``supported_mla_dimensions`` accepts
        everything.
        """
        allowed_dims = cls.supported_mla_dimensions
        if not allowed_dims:
            return True
        return mla_dimensions in allowed_dims

    @classmethod
    def is_available(cls) -> bool:
        """Tell whether the backend's dependencies are usable (base: True)."""
        return True

    def supports_quant_output(self, quant_key: "QuantKey") -> bool:
        """Report whether `run_prefill_new_tokens` can emit quantized output
        itself (fused) for ``quant_key``, so that the separate post-quant
        pass can be skipped.

        The base implementation always answers False; backends with fused
        support override it.
        """
        return False

    @classmethod
    def validate_configuration(
        cls,
        device_capability: "DeviceCapability",
        selector_config: "MLAPrefillSelectorConfig",
    ) -> list[str]:
        """Collect the reasons this backend cannot serve a configuration.

        Args:
            device_capability: Capability of the target device; its
                ``major`` and ``minor`` attributes are used in the message.
            selector_config: Provides the ``dtype`` and ``mla_dimensions``
                to check.

        Returns:
            Human-readable reasons, in check order (compute capability,
            dtype, availability, MLA dimensions). Empty when the backend
            is usable.
        """
        problems: list[str] = []

        if not cls.supports_compute_capability(device_capability):
            problems.append(
                f"compute capability {device_capability.major}."
                f"{device_capability.minor} not supported"
            )

        if not cls.supports_dtype(selector_config.dtype):
            problems.append(f"dtype {selector_config.dtype} not supported")

        if not cls.is_available():
            problems.append("required dependencies not available")

        requested_dims = selector_config.mla_dimensions
        if not cls.supports_mla_dimensions(requested_dims):
            message = (
                f"Model does not have supported MLA dimensions (got {requested_dims}"
            )
            allowed_dims = cls.supported_mla_dimensions
            # Only list alternatives when the class actually declares some
            # and the requested dimensions are not among them.
            if allowed_dims and requested_dims not in allowed_dims:
                listed = ", ".join(str(dims) for dims in allowed_dims)
                message += f"; supported: {listed}"
            problems.append(message + ")")

        return problems

    def __init__(
        self,
        num_heads: int,
        scale: float,
        kv_lora_rank: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
        vllm_config: "VllmConfig",
    ) -> None:
        """Store the constructor arguments on the instance unchanged."""
        self.num_heads = num_heads
        self.scale = scale
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.vllm_config = vllm_config

    def clone(self) -> "MLAPrefillBackend":
        """Build a new instance of the same class from the stored arguments.

        Only the values captured by ``__init__`` are carried over; metadata
        set later through ``prepare_metadata`` is not copied.
        """
        ctor_kwargs = {
            "num_heads": self.num_heads,
            "scale": self.scale,
            "kv_lora_rank": self.kv_lora_rank,
            "qk_nope_head_dim": self.qk_nope_head_dim,
            "qk_rope_head_dim": self.qk_rope_head_dim,
            "v_head_dim": self.v_head_dim,
            "vllm_config": self.vllm_config,
        }
        return self.__class__(**ctor_kwargs)

    def prepare_metadata(  # noqa: B027
        self,
        prefill_metadata: "MLACommonPrefillMetadata",
    ) -> None:
        """Remember the prefill metadata for the upcoming forward pass.

        The metadata builder calls this once the prefill metadata has been
        constructed. The base implementation only stores the object on
        ``self._prefill_metadata``.
        """
        self._prefill_metadata = prefill_metadata

    @abstractmethod
    def run_prefill_new_tokens(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        return_softmax_lse: bool,
        out: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Backend-specific prefill entry point; subclasses must implement.

        NOTE(review): the tuple return form presumably carries the softmax
        LSE when ``return_softmax_lse`` is set - confirm against the
        concrete backends.
        """
        raise NotImplementedError

    @abstractmethod
    def run_prefill_context_chunk(
        self,
        chunk_idx: int,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Backend-specific per-chunk entry point; subclasses must implement."""
        raise NotImplementedError

`prepare_metadata(prefill_metadata)` ¶

Prepare backend-specific metadata before the forward pass.

Called by the metadata builder after constructing the prefill metadata.

Source code in vllm/v1/attention/backends/mla/prefill/base.py

def prepare_metadata(  # noqa: B027
    self,
    prefill_metadata: "MLACommonPrefillMetadata",
) -> None:
    """Remember the prefill metadata for the upcoming forward pass.

    The metadata builder calls this once the prefill metadata has been
    constructed. This implementation only stores the object on
    ``self._prefill_metadata``.
    """
    self._prefill_metadata = prefill_metadata

`supports_quant_output(quant_key)` ¶

Whether run_prefill_new_tokens can write quantized output directly (fused) for the given quant key, skipping the post-quant pass. Overridden by backends that support it.

Source code in vllm/v1/attention/backends/mla/prefill/base.py

def supports_quant_output(self, quant_key: "QuantKey") -> bool:
    """Report whether `run_prefill_new_tokens` can emit quantized output
    itself (fused) for ``quant_key``, so that the separate post-quant pass
    can be skipped.

    This implementation always answers False; backends with fused support
    override it.
    """
    return False

`MLAPrefillBackendEnum` ¶

Bases: Enum

Enumeration of all supported MLA prefill backends.

Methods:

clear_override –

Clear any override for this backend, reverting to the default.
get_class –

Get the backend class (respects overrides).
get_path –

Get the class path for this backend (respects overrides).
is_overridden –

Check if this backend has been overridden.

Source code in vllm/v1/attention/backends/mla/prefill/registry.py

class MLAPrefillBackendEnum(Enum, metaclass=_MLAPrefillBackendEnumMeta):
    """All MLA prefill backends known to the registry.

    Each member's value is the default fully qualified class path of its
    implementation. Entries in ``_MLA_PREFILL_OVERRIDES`` take precedence
    over that default.
    """

    FLASH_ATTN = (
        "vllm.v1.attention.backends.mla.prefill.flash_attn.FlashAttnPrefillBackend"
    )
    FLASHINFER = (
        "vllm.v1.attention.backends.mla.prefill.flashinfer.FlashInferPrefillBackend"
    )
    TRTLLM_RAGGED = (
        "vllm.v1.attention.backends.mla.prefill.trtllm_ragged."
        "TrtllmRaggedPrefillBackend"
    )
    TOKENSPEED_MLA = (
        "vllm.v1.attention.backends.mla.prefill.tokenspeed_mla."
        "TokenspeedMLAPrefillBackend"
    )
    ROCM_AITER_FA = (
        "vllm.v1.attention.backends.mla.prefill.aiter_flash_attn."
        "AiterFlashAttnPrefillBackend"
    )
    # Slot for third-party/custom backends. It has no default class path
    # (value None), so get_path() raises until a path is registered for it.
    CUSTOM = None

    def get_path(self) -> str:
        """Return the class path for this member, honouring overrides.

        Returns:
            The fully qualified class path string.

        Raises:
            ValueError: If neither an override nor a default path exists,
                as for CUSTOM before it has been registered.
        """
        resolved = _MLA_PREFILL_OVERRIDES.get(self, self.value)
        if resolved:
            return resolved
        raise ValueError(
            f"MLA prefill backend {self.name} must be registered before "
            f"use. Use register_mla_prefill_backend("
            f"MLAPrefillBackendEnum.{self.name}, "
            f"'your.module.YourClass')"
        )

    def get_class(self) -> "type[MLAPrefillBackend]":
        """Resolve this member to its backend class, honouring overrides.

        Returns:
            The backend class named by ``get_path()``.

        Raises:
            ImportError: If the backend class cannot be imported.
            ValueError: If CUSTOM is used without being registered.
        """
        qualname = self.get_path()
        return resolve_obj_by_qualname(qualname)

    def is_overridden(self) -> bool:
        """Tell whether an override is currently registered for this member."""
        has_override = self in _MLA_PREFILL_OVERRIDES
        return has_override

    def clear_override(self) -> None:
        """Drop this member's override, if any, restoring the default path.

        Does nothing when no override is registered.
        """
        _MLA_PREFILL_OVERRIDES.pop(self, None)

`clear_override()` ¶

Clear any override for this backend, reverting to the default.

Source code in vllm/v1/attention/backends/mla/prefill/registry.py

def clear_override(self) -> None:
    """Drop this member's override, if any, restoring the default path.

    Does nothing when no override is registered.
    """
    _MLA_PREFILL_OVERRIDES.pop(self, None)

`get_class()` ¶

Get the backend class (respects overrides).

Returns:

type[MLAPrefillBackend] –

The backend class

Raises:

ImportError –

If the backend class cannot be imported
ValueError –

If CUSTOM is used without being registered

Source code in vllm/v1/attention/backends/mla/prefill/registry.py

def get_class(self) -> "type[MLAPrefillBackend]":
    """Resolve this member to its backend class, honouring overrides.

    Returns:
        The backend class named by ``get_path()``.

    Raises:
        ImportError: If the backend class cannot be imported.
        ValueError: If CUSTOM is used without being registered.
    """
    qualname = self.get_path()
    return resolve_obj_by_qualname(qualname)

`get_path()` ¶

Get the class path for this backend (respects overrides).

Returns:

str –

The fully qualified class path string

Raises:

ValueError –

If MLAPrefillBackendEnum.CUSTOM is used without being registered

Source code in vllm/v1/attention/backends/mla/prefill/registry.py

def get_path(self) -> str:
    """Return the class path for this member, honouring overrides.

    Returns:
        The fully qualified class path string.

    Raises:
        ValueError: If neither an override nor a default path exists,
            as for CUSTOM before it has been registered.
    """
    resolved = _MLA_PREFILL_OVERRIDES.get(self, self.value)
    if resolved:
        return resolved
    raise ValueError(
        f"MLA prefill backend {self.name} must be registered before "
        f"use. Use register_mla_prefill_backend("
        f"MLAPrefillBackendEnum.{self.name}, "
        f"'your.module.YourClass')"
    )

`is_overridden()` ¶

Check if this backend has been overridden.

Source code in vllm/v1/attention/backends/mla/prefill/registry.py

def is_overridden(self) -> bool:
    """Tell whether an override is currently registered for this member."""
    has_override = self in _MLA_PREFILL_OVERRIDES
    return has_override

`get_mla_prefill_backend(vllm_config)` ¶

Select the MLA prefill backend based on configuration and device.

This function first checks for explicit user preferences via mla_prefill_backend in AttentionConfig, then falls back to automatic priority-based selection.

Parameters:

vllm_config ¶
(VllmConfig) –

The vLLM configuration.

Returns:

type[MLAPrefillBackend] –

The selected prefill backend class.

Source code in vllm/v1/attention/backends/mla/prefill/selector.py

def get_mla_prefill_backend(
    vllm_config: "VllmConfig",
) -> "type[MLAPrefillBackend]":
    """Select the MLA prefill backend based on configuration and device.

    If the platform reports no device capability, the FlashAttention
    backend is returned immediately. Otherwise an explicit
    ``mla_prefill_backend`` in the attention config is validated and used;
    when none is set, automatic priority-based selection takes over.

    Args:
        vllm_config: The vLLM configuration.

    Returns:
        The selected prefill backend class.

    Raises:
        ValueError: If the explicitly selected backend cannot be imported
            or reports that it is invalid for this device/model
            configuration. When the cause is an ImportError, that error is
            chained as ``__cause__`` and its text is included in the reason.
    """
    from vllm.platforms import current_platform

    device_capability = current_platform.get_device_capability()
    if device_capability is None:
        logger.info_once(
            "Device capability not available, using FlashAttention MLA prefill backend."
        )
        return MLAPrefillBackendEnum.FLASH_ATTN.get_class()

    attention_config = vllm_config.attention_config

    # Build the selector config from the model when one is available;
    # otherwise only the process-wide default dtype is known.
    model_config = vllm_config.model_config
    if model_config is None:
        selector_config = MLAPrefillSelectorConfig(dtype=torch.get_default_dtype())
    else:
        hf_text_config = model_config.hf_text_config
        selector_config = MLAPrefillSelectorConfig(
            dtype=model_config.dtype,
            mla_dimensions=MLADimensions(
                qk_nope_head_dim=getattr(hf_text_config, "qk_nope_head_dim", 0),
                qk_rope_head_dim=getattr(hf_text_config, "qk_rope_head_dim", 0),
                v_head_dim=getattr(hf_text_config, "v_head_dim", 0),
            ),
        )

    if attention_config.mla_prefill_backend is not None:
        selected_backend = attention_config.mla_prefill_backend
        backend_cls: type[MLAPrefillBackend] | None = None
        import_error: ImportError | None = None
        try:
            backend_cls = selected_backend.get_class()
            invalid_reasons = backend_cls.validate_configuration(
                device_capability, selector_config
            )
        except ImportError as exc:
            # Keep the original message so the user can see which
            # dependency is missing instead of a bare "ImportError".
            import_error = exc
            invalid_reasons = [f"ImportError: {exc}"]
        if invalid_reasons:
            raise ValueError(
                f"Selected MLA prefill backend {selected_backend.name} "
                f"is not valid for this configuration. "
                f"Reason: {invalid_reasons}"
            ) from import_error
        assert backend_cls is not None
        logger.info_once("Using %s MLA prefill backend.", selected_backend.name)
        return backend_cls

    return _auto_select_mla_prefill_backend(
        device_capability,
        selector_config,
    )

`register_mla_prefill_backend(backend, class_path=None)` ¶

Register or override an MLA prefill backend implementation.

Parameters:

backend ¶
(MLAPrefillBackendEnum) –

The MLAPrefillBackendEnum member to register.
class_path ¶
(str | None, default: None ) –

Optional class path. If not provided and used as decorator, will be auto-generated from the class.

Returns:

Callable[[type], type] –

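A decorator that registers the decorated class if class_path is None; otherwise the registration has already been done and an identity (pass-through) decorator is returned.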

Examples:

Override an existing MLA prefill backend¶

@register_mla_prefill_backend(MLAPrefillBackendEnum.FLASH_ATTN)
class MyCustomFlashAttn(MLAPrefillBackend):
    ...

Register a custom third-party MLA prefill backend¶

@register_mla_prefill_backend(MLAPrefillBackendEnum.CUSTOM)
class MyCustomPrefillBackend(MLAPrefillBackend):
    ...

Direct registration¶

register_mla_prefill_backend(
    MLAPrefillBackendEnum.CUSTOM,
    "my.module.MyCustomPrefillBackend",
)

Source code in vllm/v1/attention/backends/mla/prefill/registry.py

def register_mla_prefill_backend(
    backend: MLAPrefillBackendEnum,
    class_path: str | None = None,
) -> Callable[[type], type]:
    """Register or override the implementation behind an MLA prefill backend.

    Args:
        backend: The MLAPrefillBackendEnum member to register.
        class_path: Optional fully qualified class path. When omitted, the
            function is meant to be used as a decorator and the path is
            derived from the decorated class.

    Returns:
        When class_path is None, a decorator that records the decorated
        class and returns it unchanged. Otherwise the path is recorded
        immediately and a pass-through callable is returned.

    Examples:
        # Replace a built-in MLA prefill backend
        @register_mla_prefill_backend(MLAPrefillBackendEnum.FLASH_ATTN)
        class MyCustomFlashAttn(MLAPrefillBackend):
            ...

        # Plug in a third-party MLA prefill backend
        @register_mla_prefill_backend(MLAPrefillBackendEnum.CUSTOM)
        class MyCustomPrefillBackend(MLAPrefillBackend):
            ...

        # Register by path, without a decorator
        register_mla_prefill_backend(
            MLAPrefillBackendEnum.CUSTOM,
            "my.module.MyCustomPrefillBackend"
        )
    """
    if class_path is not None:
        # Direct registration: the override is stored right away, so the
        # returned callable has nothing left to do.
        _MLA_PREFILL_OVERRIDES[backend] = class_path

        def _passthrough(obj):
            return obj

        return _passthrough

    def _register(cls: type) -> type:
        # Derive the dotted path from the class being decorated.
        dotted_path = ".".join((cls.__module__, cls.__qualname__))
        _MLA_PREFILL_OVERRIDES[backend] = dotted_path
        return cls

    return _register

`vllm.v1.attention.backends.mla.prefill` ¶

`MLAPrefillBackend` ¶

`prepare_metadata(prefill_metadata)` ¶

`supports_quant_output(quant_key)` ¶

`MLAPrefillBackendEnum` ¶

`clear_override()` ¶

`get_class()` ¶

`get_path()` ¶

`is_overridden()` ¶

`get_mla_prefill_backend(vllm_config)` ¶

`vllm_config` ¶

`register_mla_prefill_backend(backend, class_path=None)` ¶

`backend` ¶

`class_path` ¶

Override an existing MLA prefill backend¶

Register a custom third-party MLA prefill backend¶

Direct registration¶

vllm.v1.attention.backends.mla.prefill ¶

MLAPrefillBackend ¶

prepare_metadata(prefill_metadata) ¶

supports_quant_output(quant_key) ¶

MLAPrefillBackendEnum ¶

clear_override() ¶

get_class() ¶

get_path() ¶

is_overridden() ¶

get_mla_prefill_backend(vllm_config) ¶

vllm_config ¶

register_mla_prefill_backend(backend, class_path=None) ¶

backend ¶

class_path ¶

Override an existing MLA prefill backend¶

Register a custom third-party MLA prefill backend¶

Direct registration¶

`vllm.v1.attention.backends.mla.prefill` ¶

`MLAPrefillBackend` ¶

`prepare_metadata(prefill_metadata)` ¶

`supports_quant_output(quant_key)` ¶

`MLAPrefillBackendEnum` ¶

`clear_override()` ¶

`get_class()` ¶

`get_path()` ¶

`is_overridden()` ¶

`get_mla_prefill_backend(vllm_config)` ¶

`vllm_config` ¶

`register_mla_prefill_backend(backend, class_path=None)` ¶

`backend` ¶

`class_path` ¶