Skip to content

vllm.config

Modules:

Classes:

Functions:

AttentionConfig

Configuration for attention mechanisms in vLLM.

Methods:

Attributes:

Source code in vllm/config/attention.py
@config
class AttentionConfig:
    """Configuration for attention mechanisms in vLLM."""

    backend: AttentionBackendEnum | None = None
    """Attention backend to use. Use "auto" or None for automatic selection."""

    flash_attn_version: Literal[2, 3, 4] | None = None
    """Force vllm to use a specific flash-attention version (2, 3, or 4).
    Only valid when using the flash-attention backend."""

    use_prefill_decode_attention: bool = False
    """Use separate prefill and decode kernels for attention instead of
    the unified triton kernel."""

    flash_attn_max_num_splits_for_cuda_graph: int = 32
    """Flash Attention max number splits for cuda graph decode."""

    tq_max_kv_splits_for_cuda_graph: int = 32
    """TurboQuant max NUM_KV_SPLITS for cuda graph decode.
    Fixes the split count so grid dimensions are constant across captures,
    and buffers can be pre-allocated to avoid inflating the memory estimate."""

    use_trtllm_attention: bool | None = None
    """If set to True/False, use or don't use the TRTLLM attention backend
    in flashinfer. If None, auto-detect the attention backend in flashinfer."""

    disable_flashinfer_q_quantization: bool = False
    """If set, when using fp8 kv, do not quantize Q to fp8."""

    mla_prefill_backend: MLAPrefillBackendEnum | None = None
    """MLA prefill backend to use. If None, will be selected automatically.
    Valid options: FLASH_ATTN (FA3/FA4), FLASHINFER, TRTLLM_RAGGED."""

    use_prefill_query_quantization: bool = False
    """If set, quantize query for attention in prefill."""

    use_fp4_indexer_cache: bool = False
    """If set, use fp4 indexer cache for dsv32 family model (not support yet)"""

    indexer_kv_dtype: IndexerKVDType = "bf16"
    """Data type for the sparse-attention indexer K cache. Quantized formats
    (fp8, mxfp4, nvfp4) require indexer kernel support in the backend."""

    use_non_causal: bool = False
    """Whether to use non-causal (bidirectional) attention."""

    flex_attn_block_m: int | None = None
    """Triton kernel BLOCK_M tile size for flex attention.
    Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1,
    defaults to 16."""

    flex_attn_block_n: int | None = None
    """Triton kernel BLOCK_N tile size for flex attention.
    Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1,
    defaults to 16."""

    flex_attn_q_block_size: int | None = None
    """Logical Q block size for the flex attention block mask.
    Must be a power of 2 and divisible by flex_attn_block_m.
    If None, uses the default (16 on PyTorch >= 2.9, 128 otherwise)."""

    flex_attn_kv_block_size: int | None = None
    """Logical KV block size for the flex attention block mask.
    Must be a power of 2 and divisible by flex_attn_block_n.
    If None, uses the default (kv_cache_block_size on PyTorch >= 2.9,
    128 otherwise)."""

    def compute_hash(self) -> str:
        """
        Hash the configs that shape the computation graph.

        The hash is meant to identify everything that affects the graph
        structure between the input ids/embeddings and the final hidden
        states; anything before or after that span is out of scope.
        An empty ignore set is passed to `get_hash_factors` here.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        excluded: set[str] = set()
        return hash_factors(get_hash_factors(self, excluded))

    @field_validator("backend", mode="before")
    @classmethod
    def validate_backend_before(cls, value: Any) -> Any:
        """Convert a string `backend` value into its enum member.

        Non-string values pass through untouched. The string "auto"
        (any casing) becomes None, which requests automatic backend
        selection; any other string is upper-cased and looked up by
        name in `AttentionBackendEnum`.
        """
        if not isinstance(value, str):
            return value
        if value.lower() == "auto":
            return None
        return AttentionBackendEnum[value.upper()]

    @field_validator("mla_prefill_backend", mode="before")
    @classmethod
    def validate_mla_prefill_backend_before(cls, value: Any) -> Any:
        """Convert a string `mla_prefill_backend` value into its enum member.

        Strings are upper-cased and looked up by name in
        `MLAPrefillBackendEnum`; everything else is returned unchanged.
        """
        if not isinstance(value, str):
            return value
        return MLAPrefillBackendEnum[value.upper()]

backend = None class-attribute instance-attribute

Attention backend to use. Use "auto" or None for automatic selection.

disable_flashinfer_q_quantization = False class-attribute instance-attribute

If set, when using fp8 kv, do not quantize Q to fp8.

flash_attn_max_num_splits_for_cuda_graph = 32 class-attribute instance-attribute

Flash Attention max number splits for cuda graph decode.

flash_attn_version = None class-attribute instance-attribute

Force vllm to use a specific flash-attention version (2, 3, or 4). Only valid when using the flash-attention backend.

flex_attn_block_m = None class-attribute instance-attribute

Triton kernel BLOCK_M tile size for flex attention. Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1, defaults to 16.

flex_attn_block_n = None class-attribute instance-attribute

Triton kernel BLOCK_N tile size for flex attention. Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1, defaults to 16.

flex_attn_kv_block_size = None class-attribute instance-attribute

Logical KV block size for the flex attention block mask. Must be a power of 2 and divisible by flex_attn_block_n. If None, uses the default (kv_cache_block_size on PyTorch >= 2.9, 128 otherwise).

flex_attn_q_block_size = None class-attribute instance-attribute

Logical Q block size for the flex attention block mask. Must be a power of 2 and divisible by flex_attn_block_m. If None, uses the default (16 on PyTorch >= 2.9, 128 otherwise).

indexer_kv_dtype = 'bf16' class-attribute instance-attribute

Data type for the sparse-attention indexer K cache. Quantized formats (fp8, mxfp4, nvfp4) require indexer kernel support in the backend.

mla_prefill_backend = None class-attribute instance-attribute

MLA prefill backend to use. If None, will be selected automatically. Valid options: FLASH_ATTN (FA3/FA4), FLASHINFER, TRTLLM_RAGGED.

tq_max_kv_splits_for_cuda_graph = 32 class-attribute instance-attribute

TurboQuant max NUM_KV_SPLITS for cuda graph decode. Fixes the split count so grid dimensions are constant across captures, and buffers can be pre-allocated to avoid inflating the memory estimate.

use_fp4_indexer_cache = False class-attribute instance-attribute

If set, use fp4 indexer cache for dsv32 family models (not supported yet).

use_non_causal = False class-attribute instance-attribute

Whether to use non-causal (bidirectional) attention.

use_prefill_decode_attention = False class-attribute instance-attribute

Use separate prefill and decode kernels for attention instead of the unified triton kernel.

use_prefill_query_quantization = False class-attribute instance-attribute

If set, quantize query for attention in prefill.

use_trtllm_attention = None class-attribute instance-attribute

If set to True/False, use or don't use the TRTLLM attention backend in flashinfer. If None, auto-detect the attention backend in flashinfer.

compute_hash()

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/attention.py
def compute_hash(self) -> str:
    """
    Hash the configs that shape the computation graph.

    The hash is meant to identify everything that affects the graph
    structure between the input ids/embeddings and the final hidden
    states; anything before or after that span is out of scope.
    An empty ignore set is passed to `get_hash_factors` here.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    excluded: set[str] = set()
    return hash_factors(get_hash_factors(self, excluded))

validate_backend_before(value) classmethod

Enable parsing of the backend enum type from string.

The special value "auto" is treated as None, which triggers automatic backend selection.

Source code in vllm/config/attention.py
@field_validator("backend", mode="before")
@classmethod
def validate_backend_before(cls, value: Any) -> Any:
    """Convert a string `backend` value into its enum member.

    Non-string values pass through untouched. The string "auto"
    (any casing) becomes None, which requests automatic backend
    selection; any other string is upper-cased and looked up by
    name in `AttentionBackendEnum`.
    """
    if not isinstance(value, str):
        return value
    if value.lower() == "auto":
        return None
    return AttentionBackendEnum[value.upper()]

validate_mla_prefill_backend_before(value) classmethod

Enable parsing of the mla_prefill_backend enum type from string.

Source code in vllm/config/attention.py
@field_validator("mla_prefill_backend", mode="before")
@classmethod
def validate_mla_prefill_backend_before(cls, value: Any) -> Any:
    """Convert a string `mla_prefill_backend` value into its enum member.

    Strings are upper-cased and looked up by name in
    `MLAPrefillBackendEnum`; everything else is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return MLAPrefillBackendEnum[value.upper()]

CUDAGraphMode

Bases: Enum

Constants for the cudagraph mode in CompilationConfig. In addition, the subset of members NONE, PIECEWISE and FULL also serves as the concrete runtime modes used for cudagraph runtime dispatching.

Source code in vllm/config/compilation.py
class CUDAGraphMode(enum.Enum):
    """Cudagraph mode constants used by CompilationConfig.

    `NONE`, `PIECEWISE` and `FULL` hold plain integers and double as the
    concrete runtime modes for cudagraph runtime dispatching. The remaining
    members hold a `(decode, mixed)` tuple of those integers, i.e. a
    separate mode for each of the two routines.
    """

    NONE = 0
    PIECEWISE = 1
    FULL = 2
    FULL_DECODE_ONLY = (FULL, NONE)
    FULL_AND_PIECEWISE = (FULL, PIECEWISE)

    def decode_mode(self) -> "CUDAGraphMode":
        """Return the first element of a tuple-valued mode, else `self`."""
        if not self.separate_routine():
            return self
        return CUDAGraphMode(self.value[0])

    def mixed_mode(self) -> "CUDAGraphMode":
        """Return the second element of a tuple-valued mode, else `self`."""
        if not self.separate_routine():
            return self
        return CUDAGraphMode(self.value[1])

    def has_mode(self, mode: "CUDAGraphMode") -> bool:
        """Tell whether `mode` (a non-tuple member) is part of this mode."""
        assert not mode.separate_routine()
        if not self.separate_routine():
            return self == mode
        return mode.value in self.value

    def requires_piecewise_compilation(self) -> bool:
        """Tell whether PIECEWISE is part of this mode."""
        return self.has_mode(CUDAGraphMode.PIECEWISE)

    def max_cudagraph_mode(self) -> "CUDAGraphMode":
        """Return the member with the largest value among the components."""
        if not self.separate_routine():
            return self
        return CUDAGraphMode(max(self.value))

    def has_full_cudagraphs(self) -> bool:
        """Tell whether the largest component of this mode is FULL."""
        strongest = self.max_cudagraph_mode()
        return strongest == CUDAGraphMode.FULL

    def has_piecewise_cudagraphs(self) -> bool:
        """Alias of `requires_piecewise_compilation`."""
        return self.requires_piecewise_compilation()

    def separate_routine(self) -> bool:
        """Tell whether this member's value is a tuple of two modes."""
        return isinstance(self.value, tuple)

    @classmethod
    def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
        """Return the members usable as concrete runtime modes."""
        return frozenset((cls.NONE, cls.PIECEWISE, cls.FULL))

    def is_valid_runtime_mode(self) -> bool:
        """Tell whether this member is one of the runtime modes."""
        runtime_modes = CUDAGraphMode.valid_runtime_modes()
        return self in runtime_modes

    def __str__(self) -> str:
        return self.name

    def __bool__(self) -> bool:
        # Every mode except NONE is truthy.
        return not (self == CUDAGraphMode.NONE)

CacheConfig

Configuration for the KV cache.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Attributes:

Source code in vllm/config/cache.py
@config
class CacheConfig:
    """Configuration for the KV cache."""

    DEFAULT_BLOCK_SIZE: ClassVar[int] = 16

    block_size: int = Field(default=None, gt=0)  # type: ignore[assignment]
    """Size of a contiguous cache block in number of tokens.
    Accepts None (meaning "use default"). After construction, always int."""
    user_specified_block_size: bool = field(default=False, init=False)
    """Whether block_size was explicitly provided. Derived automatically."""
    user_specified_mamba_block_size: bool = field(default=False, init=False)
    """Whether mamba_block_size was explicitly provided. Derived automatically."""
    hash_block_size: int | None = Field(default=None, gt=0)
    """Block size (in tokens) used for computing Request's block_hashes.

    This can be set to a finer granularity than the physical KV cache block
    sizes (e.g. 8) as long as every KV cache group's `block_size` is divisible
    by it. This enables prefix-caching keys to be computed at the finest common
    granularity and then merged for larger physical block sizes.

    This config is not static default. If left unspecified, vLLM will choose a
    default based on the resolved KV cache groups (typically the smallest KV
    cache block size when there are multiple groups).
    """
    gpu_memory_utilization: float = Field(default=0.92, gt=0, le=1)
    """The fraction of GPU memory to be used for the model executor, which can
    range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
    utilization. If unspecified, will use the default value of 0.92. This is a
    per-instance limit, and only applies to the current vLLM instance. It does
    not matter if you have another vLLM instance running on the same GPU. For
    example, if you have two vLLM instances running on the same GPU, you can
    set the GPU memory utilization to 0.5 for each instance."""
    cache_dtype: CacheDType = "auto"
    """Data type for kv cache storage. If "auto", will use model data type.
    CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
    fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).
    Some models (namely DeepSeekV3.2) default to fp8, set to bfloat16 to use
    bfloat16 instead, this is an invalid option for models that do not default
    to fp8.
    """
    is_attention_free: bool = False
    """Whether the model is attention-free. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    num_gpu_blocks_override: int | None = None
    """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
    if specified. Does nothing if `None`. Used for testing preemption."""
    sliding_window: int | None = None
    """Sliding window size for the KV cache. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    enable_prefix_caching: bool = True
    """Whether to enable prefix caching."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
    """Set the hash algorithm for prefix caching:

    - "sha256" uses Pickle for object serialization before hashing. This is the current
      default, as SHA256 is the most secure choice to avoid potential hash collisions.
    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
      serializes objects using canonical CBOR and hashes them with SHA-256.
    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
      non-cryptographic hashing. Requires the optional ``xxhash`` package.
      IMPORTANT: Use of a hashing algorithm that is not considered  cryptographically
      secure theoretically increases the risk of hash collisions, which can cause
      undefined behavior or even leak private information in multi-tenant environments.
      Even if collisions are still very unlikely, it is important to consider your
      security risk tolerance against the performance benefits before turning this on.
    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
      reproducible hashing. Requires the optional ``xxhash`` package."""
    calculate_kv_scales: bool = False
    """Deprecated: This option is deprecated and will be removed in v0.19.
    It enables dynamic calculation of `k_scale` and `v_scale` when
    kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
    checkpoint if available. Otherwise, the scales will default to 1.0."""
    kv_cache_dtype_skip_layers: list[str] = field(default_factory=list)
    """Layer patterns to skip KV cache quantization. Accepts layer indices
    (e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
    mamba_page_size_padded: int | None = None
    """ Optional override for mamba page size; used by hybrid mamba/attention
    models to ensure exact alignment with attention page size."""
    mamba_block_size: int | None = Field(default=None, gt=0)
    """Size of a contiguous cache block in number of tokens for mamba cache.
    Can be set only when prefix caching is enabled.
    Value must be a multiple of 8 to align with causal_conv1d kernel."""
    mamba_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (both the conv as well as the
    ssm state). If set to 'auto', the data type will be inferred from the model
    config."""
    mamba_ssm_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (ssm state only, conv state will
    still be controlled by mamba_cache_dtype). If set to 'auto', the data type
    for the ssm state will be determined by mamba_cache_dtype."""
    mamba_cache_mode: MambaCacheMode = "none"
    """The cache strategy for Mamba layers.
    - "none": set when prefix caching is disabled.
    - "all": cache the mamba state of all tokens at position i * block_size. This is
           the default behavior (for models that support it) when prefix caching is
           enabled.
    - "align": only cache the mamba state of the last token of each scheduler step and
           when the token is at position i * block_size.
    """

    # Will be set after profiling.
    num_gpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for GPU memory."""
    num_cpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for CPU memory."""

    # Set after KV cache initialization.
    kv_cache_size_tokens: int | None = field(default=None, init=False)
    """Per-DP-engine KV cache capacity in tokens (group-aware). Uses
    group-aware capacity since num_gpu_blocks * block_size can be wrong
    for hybrid models where requests occupy multiple KV cache groups."""
    kv_cache_max_concurrency: float | None = field(default=None, init=False)
    """Per-DP-engine maximum concurrency at max_model_len tokens."""

    kv_sharing_fast_prefill: bool = False
    """This feature is work in progress and no prefill optimization takes place
    with this flag enabled currently.

    In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
    some layers can skip tokens corresponding to prefill. This flag enables
    attention metadata for eligible layers to be overridden with metadata
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """

    kv_cache_memory_bytes: int | None = None
    """Size of KV Cache per GPU in bytes. By default, this is set to None
    and vllm can automatically infer the kv cache size based on
    gpu_memory_utilization. However, users may want to manually specify
    the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
    control of how much memory gets used when compared with using
    gpu_memory_utilization. Note that kv_cache_memory_bytes
    (when not-None) ignores gpu_memory_utilization"""

    kv_offloading_size: float | None = None
    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
    the total buffer size summed across all TP ranks. By default, this is set
    to None, which means no KV offloading is enabled. When set, vLLM will
    enable KV cache offloading to CPU using the kv_offloading_backend."""

    kv_offloading_backend: KVOffloadingBackend = "native"
    """The backend to use for KV cache offloading. Supported backends include
    'native' (vLLM native CPU offloading), 'lmcache'.
    KV offloading is only activated when kv_offloading_size is set."""

    def compute_hash(self) -> str:
        """
        WARNING: when adding a field to this config, make sure it ends up
        in the hashed factors if it affects the computation graph (i.e. do
        not list it in the ignored names below).

        Hash the configs that shape the computation graph between the
        input ids/embeddings and the final hidden states; anything before
        or after that span is out of scope.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        excluded = {
            # Runtime or derived settings; compiled graph shape is unaffected.
            "gpu_memory_utilization",
            "is_attention_free",
            "num_gpu_blocks_override",
            "enable_prefix_caching",
            "prefix_caching_hash_algo",
            # Prefix-caching implementation detail (no effect on the graph).
            "hash_block_size",
            "mamba_page_size_padded",
            "user_specified_block_size",
            "user_specified_mamba_block_size",
            "_block_size_resolved",
            # Counters that are filled in / derived after init.
            "num_gpu_blocks",
            "num_cpu_blocks",
            "kv_cache_size_tokens",
            "kv_cache_max_concurrency",
            # Work-in-progress toggle; compiled graph shape is unaffected.
            "kv_sharing_fast_prefill",
        }
        return hash_factors(get_hash_factors(self, excluded))

    def metrics_info(self):
        """Return the instance attributes as a str -> str dict.

        The stringified mapping is the form used for prometheus metrics info.
        """
        info = {}
        for name, attr in self.__dict__.items():
            info[name] = str(attr)
        return info

    _block_size_resolved: bool = field(default=False, init=False)
    """Guard against pydantic re-running _apply_block_size_default."""

    @field_validator("block_size", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Let None through unvalidated; delegate any other value to `handler`."""
        return value if value is None else handler(value)

    @model_validator(mode="after")
    def _apply_block_size_default(self) -> "CacheConfig":
        """Fill in the default block size and record what was user-supplied.

        Runs its body only once per instance: pydantic re-runs validators
        when CacheConfig is nested inside another pydantic model (e.g.
        VllmConfig), so `_block_size_resolved` short-circuits later calls.
        """
        if self._block_size_resolved:
            return self
        self._block_size_resolved = True

        if self.block_size is not None:
            self.user_specified_block_size = True
        else:
            self.block_size = self.DEFAULT_BLOCK_SIZE

        if self.mamba_block_size is not None:
            self.user_specified_mamba_block_size = True
        return self

    @field_validator("calculate_kv_scales", mode="after")
    @classmethod
    def _warn_deprecated_calculate_kv_scales(cls, calculate_kv_scales: bool) -> bool:
        """Log a deprecation warning when the option is enabled; return it as-is."""
        if not calculate_kv_scales:
            return calculate_kv_scales
        logger.warning(
            "The `--calculate-kv-scales` option is deprecated and will "
            "be removed in v0.19. The scales will be loaded from the "
            "model checkpoint if available, otherwise they default to "
            "1.0."
        )
        return calculate_kv_scales

    @field_validator("cache_dtype", mode="after")
    @classmethod
    def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
        """Log an informational note about the chosen KV cache dtype.

        The value itself is always returned unchanged. The per-token-head
        check takes precedence over the generic quantized-cache check.
        """
        if kv_cache_uses_per_token_head_scales(cache_dtype):
            logger.info(
                "Using %s data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
                "Dynamic per-token-head scales will be computed at runtime.",
                str(cache_dtype),
            )
            return cache_dtype
        if is_quantized_kv_cache(cache_dtype):
            logger.info(
                "Using %s data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
                "Meanwhile, it may cause accuracy drop without a proper "
                "scaling factor",
                str(cache_dtype),
            )
        return cache_dtype

_block_size_resolved = field(default=False, init=False) class-attribute instance-attribute

Guard against pydantic re-running _apply_block_size_default.

block_size = Field(default=None, gt=0) class-attribute instance-attribute

Size of a contiguous cache block in number of tokens. Accepts None (meaning "use default"). After construction, always int.

cache_dtype = 'auto' class-attribute instance-attribute

Data type for kv cache storage. If "auto", will use model data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc). Some models (namely DeepSeekV3.2) default to fp8, set to bfloat16 to use bfloat16 instead, this is an invalid option for models that do not default to fp8.

calculate_kv_scales = False class-attribute instance-attribute

Deprecated: This option is deprecated and will be removed in v0.19. It enables dynamic calculation of k_scale and v_scale when kv_cache_dtype is fp8. If False, the scales will be loaded from the model checkpoint if available. Otherwise, the scales will default to 1.0.

enable_prefix_caching = True class-attribute instance-attribute

Whether to enable prefix caching.

gpu_memory_utilization = Field(default=0.92, gt=0, le=1) class-attribute instance-attribute

The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory utilization. If unspecified, will use the default value of 0.92. This is a per-instance limit, and only applies to the current vLLM instance. It does not matter if you have another vLLM instance running on the same GPU. For example, if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.

hash_block_size = Field(default=None, gt=0) class-attribute instance-attribute

Block size (in tokens) used for computing Request's block_hashes.

This can be set to a finer granularity than the physical KV cache block sizes (e.g. 8) as long as every KV cache group's block_size is divisible by it. This enables prefix-caching keys to be computed at the finest common granularity and then merged for larger physical block sizes.

This config has no static default. If left unspecified, vLLM will choose a default based on the resolved KV cache groups (typically the smallest KV cache block size when there are multiple groups).

is_attention_free = False class-attribute instance-attribute

Whether the model is attention-free. This is primarily set in ModelConfig and that value should be manually duplicated here.

kv_cache_dtype_skip_layers = field(default_factory=list) class-attribute instance-attribute

Layer patterns to skip KV cache quantization. Accepts layer indices (e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window').

kv_cache_max_concurrency = field(default=None, init=False) class-attribute instance-attribute

Per-DP-engine maximum concurrency at max_model_len tokens.

kv_cache_memory_bytes = None class-attribute instance-attribute

Size of KV Cache per GPU in bytes. By default, this is set to None and vllm can automatically infer the kv cache size based on gpu_memory_utilization. However, users may want to manually specify the kv cache memory size. kv_cache_memory_bytes allows more fine-grain control of how much memory gets used when compared with using gpu_memory_utilization. Note that kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization

kv_cache_size_tokens = field(default=None, init=False) class-attribute instance-attribute

Per-DP-engine KV cache capacity in tokens (group-aware). Uses group-aware capacity since num_gpu_blocks * block_size can be wrong for hybrid models where requests occupy multiple KV cache groups.

kv_offloading_backend = 'native' class-attribute instance-attribute

The backend to use for KV cache offloading. Supported backends include 'native' (vLLM native CPU offloading), 'lmcache'. KV offloading is only activated when kv_offloading_size is set.

kv_offloading_size = None class-attribute instance-attribute

Size of the KV cache offloading buffer in GiB. When TP > 1, this is the total buffer size summed across all TP ranks. By default, this is set to None, which means no KV offloading is enabled. When set, vLLM will enable KV cache offloading to CPU using the kv_offloading_backend.

kv_sharing_fast_prefill = False class-attribute instance-attribute

This feature is work in progress and no prefill optimization takes place with this flag enabled currently.

In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), some layers can skip tokens corresponding to prefill. This flag enables attention metadata for eligible layers to be overridden with metadata necessary for implementing this optimization in some models (e.g. Gemma3n)

mamba_block_size = Field(default=None, gt=0) class-attribute instance-attribute

Size of a contiguous cache block in number of tokens for mamba cache. Can be set only when prefix caching is enabled. Value must be a multiple of 8 to align with causal_conv1d kernel.

mamba_cache_dtype = 'auto' class-attribute instance-attribute

The data type to use for the Mamba cache (both the conv as well as the ssm state). If set to 'auto', the data type will be inferred from the model config.

mamba_cache_mode = 'none' class-attribute instance-attribute

The cache strategy for Mamba layers:

  • "none": set when prefix caching is disabled.
  • "all": cache the mamba state of all tokens at position i * block_size. This is the default behavior (for models that support it) when prefix caching is enabled.
  • "align": only cache the mamba state of the last token of each scheduler step and when the token is at position i * block_size.

mamba_page_size_padded = None class-attribute instance-attribute

Optional override for mamba page size; used by hybrid mamba/attention models to ensure exact alignment with attention page size.

mamba_ssm_cache_dtype = 'auto' class-attribute instance-attribute

The data type to use for the Mamba cache (ssm state only, conv state will still be controlled by mamba_cache_dtype). If set to 'auto', the data type for the ssm state will be determined by mamba_cache_dtype.

num_cpu_blocks = field(default=None, init=False) class-attribute instance-attribute

The number of blocks to allocate for CPU memory.

num_gpu_blocks = field(default=None, init=False) class-attribute instance-attribute

The number of blocks to allocate for GPU memory.

num_gpu_blocks_override = None class-attribute instance-attribute

Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified. Does nothing if None. Used for testing preemption.

prefix_caching_hash_algo = 'sha256' class-attribute instance-attribute

Set the hash algorithm for prefix caching:

  • "sha256" uses Pickle for object serialization before hashing. This is the current default, as SHA256 is the most secure choice to avoid potential hash collisions.
  • "sha256_cbor" provides a reproducible, cross-language compatible hash. It serializes objects using canonical CBOR and hashes them with SHA-256.
  • "xxhash" uses Pickle serialization with xxHash (128-bit) for faster, non-cryptographic hashing. Requires the optional xxhash package. IMPORTANT: Use of a hashing algorithm that is not considered cryptographically secure theoretically increases the risk of hash collisions, which can cause undefined behavior or even leak private information in multi-tenant environments. Even if collisions are still very unlikely, it is important to consider your security risk tolerance against the performance benefits before turning this on.
  • "xxhash_cbor" combines canonical CBOR serialization with xxHash for reproducible hashing. Requires the optional xxhash package.

sliding_window = None class-attribute instance-attribute

Sliding window size for the KV cache. This is primarily set in ModelConfig and that value should be manually duplicated here.

user_specified_block_size = field(default=False, init=False) class-attribute instance-attribute

Whether block_size was explicitly provided. Derived automatically.

user_specified_mamba_block_size = field(default=False, init=False) class-attribute instance-attribute

Whether mamba_block_size was explicitly provided. Derived automatically.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/cache.py
def compute_hash(self) -> str:
    """
    Return a hash over the config fields that shape the computation graph.

    The graph considered runs from input ids/embeddings to the final
    hidden states; anything before or after that is out of scope.

    WARNING: Whenever a new field is added to this config, make sure it
    is part of the hashed factors (i.e. not listed in the exclude set
    below) if it affects the computation graph.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # Runtime/derived knobs that don't affect compiled graph shape.
    runtime_knobs = {
        "gpu_memory_utilization",
        "is_attention_free",
        "num_gpu_blocks_override",
        "enable_prefix_caching",
        "prefix_caching_hash_algo",
    }
    # hash_block_size is a prefix-caching implementation detail (doesn't
    # affect the compiled graph); the entries after it are excluded with it.
    implementation_details = {
        "hash_block_size",
        "mamba_page_size_padded",
        "user_specified_block_size",
        "user_specified_mamba_block_size",
        "_block_size_resolved",
    }
    # Post-init/derived counters.
    derived_counters = {
        "num_gpu_blocks",
        "num_cpu_blocks",
        "kv_cache_size_tokens",
        "kv_cache_max_concurrency",
    }
    # WIP feature toggle not impacting compiled graph shape.
    wip_toggles = {"kv_sharing_fast_prefill"}

    excluded = (
        runtime_knobs | implementation_details | derived_counters | wip_toggles
    )
    return hash_factors(get_hash_factors(self, excluded))

CompilationConfig

Configuration for compilation.

You must pass CompilationConfig to VLLMConfig constructor. VLLMConfig's post_init does further initialization. If used outside of the VLLMConfig, some fields will be left in an improper state.

It contains PassConfig, which controls the custom fusion/transformation passes. The rest has three parts: top-level compilation control, CudaGraph capture, and Inductor compilation.

Why we have different sizes for cudagraph and inductor:

  • cudagraph: a cudagraph captured for a specific size can only be used for the same size. We need to capture all the sizes we want to use.
  • inductor: a graph compiled by inductor for a general shape can be used for different sizes. Inductor can also compile for specific sizes, where it can have more information to optimize the graph with fully static shapes. However, we find the general shape compilation is sufficient for most cases. It might be beneficial to compile for certain small batch sizes, where inductor is good at optimizing.

Methods:

Attributes:

Source code in vllm/config/compilation.py
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
@config
class CompilationConfig:
    """Configuration for compilation.

    You must pass CompilationConfig to VLLMConfig constructor.
    VLLMConfig's post_init does further initialization. If used outside of the
    VLLMConfig, some fields will be left in an improper state.

    It contains PassConfig, which controls the custom fusion/transformation passes.
    The rest has three parts:

    - Top-level Compilation control:
        - [`mode`][vllm.config.CompilationConfig.mode]
        - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
        - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
        - [`backend`][vllm.config.CompilationConfig.backend]
        - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
        - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
        - [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
    - CudaGraph capture:
        - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
        - [`cudagraph_capture_sizes`]
        [vllm.config.CompilationConfig.cudagraph_capture_sizes]
        - [`max_cudagraph_capture_size`]
        [vllm.config.CompilationConfig.max_cudagraph_capture_size]
        - [`cudagraph_num_of_warmups`]
        [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
        - [`cudagraph_copy_inputs`]
        [vllm.config.CompilationConfig.cudagraph_copy_inputs]
    - Inductor compilation:
        - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
        - [`compile_ranges_endpoints`]
            [vllm.config.CompilationConfig.compile_ranges_endpoints]
        - [`inductor_compile_config`]
        [vllm.config.CompilationConfig.inductor_compile_config]
        - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
        - custom inductor passes

    Why we have different sizes for cudagraph and inductor:
    - cudagraph: a cudagraph captured for a specific size can only be used
        for the same size. We need to capture all the sizes we want to use.
    - inductor: a graph compiled by inductor for a general shape can be used
        for different sizes. Inductor can also compile for specific sizes,
        where it can have more information to optimize the graph with fully
        static shapes. However, we find the general shape compilation is
        sufficient for most cases. It might be beneficial to compile for
        certain small batchsizes, where inductor is good at optimizing.
    """

    # Top-level Compilation control
    mode: CompilationMode = None  # type: ignore[assignment]
    """The compilation approach used for torch.compile-based compilation of the
    model.

    - None: If None, we will select the default compilation mode.
      For V1 engine this is 3.
    - 0: NONE: No torch.compile compilation is applied, model runs in fully
         eager pytorch mode. The model runs as-is.
    - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
    - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
         recompilation by removing guards.
         Requires no dynamic-shape-dependent control-flow.
    - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
         piecewise compilation, shape specialization, and custom passes."""
    debug_dump_path: Path | None = None
    """The path to dump the debug information."""
    cache_dir: str = ""
    """The directory to store the compiled graph, to accelerate Inductor
    compilation. By default, it will use model-related information to generate
    a cache directory."""
    compile_cache_save_format: Literal["binary", "unpacked"] = field(
        default_factory=lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT
    )
    """Format for saving torch compile cache:\n
    - "binary": saves as binary file (multiprocess safe)\n
    - "unpacked": saves as directory structure for inspection/debugging
    (NOT multiprocess safe)\n
    Defaults to `VLLM_COMPILE_CACHE_SAVE_FORMAT` if not specified.
    """
    backend: str = ""
    """The backend for compilation. It needs to be a string:

    - "" (empty string): use the default backend ("inductor" on CUDA-alike
    platforms).
    - "eager"/"openxla"/...: use the specified backend registered in PyTorch.
    - "full.module.name": a qualified name which can be used to import the
      backend function.

    We use string to avoid serialization issues when using compilation in a
    distributed setting. When the compilation mode is 1 or 2, the backend is
    used for the compilation directly (it sees the whole graph). When the
    compilation mode is 3, the backend supports both whole graph and piecewise
    compilation, available backends include eager, inductor, and custom backends,
    the latter of which can be defined via `get_compile_backend`. Furthermore,
    compilation is only piecewise if splitting ops is set accordingly and
    use_inductor_graph_partition is off. Note that the default options for
    splitting ops are sufficient for piecewise compilation.
    """
    custom_ops: list[str] = field(default_factory=list)
    """Fine-grained control over which custom ops to enable/disable. Use 'all'
    to enable all, 'none' to disable all. Also specify a list of custom op
    names to enable (prefixed with a '+'), or disable (prefixed with a '-').
    Examples:

    - 'all,-op1' to enable all except op1
    - 'none,+op1,+op2' to enable only op1 and op2

    By default, all custom ops are enabled when running without Inductor and
    disabled when running with Inductor: mode>CompilationMode.NONE and
    backend="inductor".
    Inductor generates (fused) Triton kernels for disabled custom ops."""

    ir_enable_torch_wrap: bool = None  # type: ignore[assignment]
    """If True, enable vllm_ir torch custom op wrapping during the forward pass.
    When False, torch custom op wrapping is disabled, allowing Dynamo to trace the
    selected implementation directly or avoiding torch custom op overhead in eager mode.
    Defaults to True when using Inductor with vllm-compile
    (backend=="inductor" and mode == VLLM_COMPILE), False otherwise.
    """

    splitting_ops: list[str] | None = None
    """A list of ops to exclude from cudagraphs, used in piecewise compilation.

    The behavior depends on use_inductor_graph_partition:

    - When use_inductor_graph_partition=False (default):
        These ops are used for Dynamo FX-level graph splitting. The graph is
        split at these ops before Inductor compilation, creating separate
        subgraphs for cudagraph capture.

    - When use_inductor_graph_partition=True:
        These ops are used to register Inductor partition rules. The graph
        partitioning happens at Inductor codegen time after all passes and
        fusions are finished, allowing compilation and custom passes to operate
        on the full graph while still excluding these ops from cudagraphs.

    If None, defaults to attention ops for piecewise cudagraphs.
    If empty list [], no ops are excluded (suitable for full cudagraphs)."""
    compile_mm_encoder: bool = False
    """Whether or not to compile the multimodal encoder.
    Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models on selected
    platforms. It may also work for models loaded with the Transformers modeling backend
    if the encoder is compilable. Disabled by default until more models are
    supported/tested to work."""

    # Vision encoder CUDA graph
    cudagraph_mm_encoder: bool = False
    """Enable CUDA graph capture for multimodal encoder (ViT).
    When enabled, captures full encoder forward as CUDA graph
    for each token budget level."""

    encoder_cudagraph_token_budgets: list[int] = field(default_factory=list)
    """Token budget levels for encoder CUDA graph capture.
    Each budget defines a fixed token capacity. At runtime, images are greedy-packed
    into the smallest fitting budget and the corresponding CUDA graph is replayed.
    If empty (default), auto-inferred from model architecture as power-of-2
    levels from the model's estimated min budget to max budget.
    User-provided values override auto-inference.
    Example: [2048, 4096, 8192, 13824]"""

    encoder_cudagraph_max_vision_items_per_batch: int = 0
    """Maximum number of images/videos per batch for encoder CUDA graph capture.
    Determines the fixed batch size used during graph capture.
    If 0 (default), auto-inferred as max_budget // min_budget from the
    model's budget range. User-provided positive value overrides
    auto-inference."""

    encoder_cudagraph_max_frames_per_batch: int | None = None
    """Maximum total video frames per batch for encoder CUDA graph capture.
    Controls the cu_seqlens buffer size (one entry per attention sequence,
    i.e. one per video frame).
    If None (default), auto-inferred as encoder_cudagraph_max_vision_items_per_batch
    * max_frames_per_video (model-specific value according to processing_info).
    Positive value overrides auto-inference and applies to all budget levels.
    If we limit the video count per prompt to `0`, it will also be set to `0`
    (i.e., fall back to image-only mode)."""

    # Inductor capture
    compile_sizes: list[int | str] | None = None
    """Sizes to compile for inductor. In addition
    to integers, it also supports "cudagraph_capture_sizes" to
    specify the sizes for cudagraph capture."""

    compile_ranges_endpoints: list[int] | None = None
    """Endpoints for Inductor compile ranges.
    The compile ranges are
    [1, endpoints[0]],
    [endpoints[0] + 1, endpoints[1]], ...,
    [endpoints[-1] + 1, max_num_batched_tokens].
    Compile sizes are also used as single-element ranges,
    the range is represented as [compile_sizes[i], compile_sizes[i]].

    If a range overlaps with the compile size, graph for compile size
    will be prioritized, i.e. if we have a range [1, 8] and a compile size 4,
    graph for compile size 4 will be compiled and used instead of the graph
    for range [1, 8].
    """

    inductor_compile_config: dict = field(default_factory=dict)
    """Additional configurations for inductor.
    - None: use default configurations."""

    inductor_passes: dict[str, str] = field(default_factory=dict)
    """Additional passes for inductor. It is a dictionary
    from pass name to pass function qualified name. We use function
    name because the config uses JSON format. If we pass the config
    from Python, functions can also be passed directly via Python object
    constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

    # CudaGraph compilation
    cudagraph_mode: CUDAGraphMode = None  # type: ignore[assignment]
    """
    The mode of the cudagraph:

    - NONE, no cudagraph capture.
    - PIECEWISE.
    - FULL.
    - FULL_DECODE_ONLY.
    - FULL_AND_PIECEWISE. (v1 default)

    PIECEWISE mode builds piecewise cudagraph only, keeping the cudagraph
    incompatible ops (i.e. some attention ops) outside the cudagraph
    for general flexibility.

    FULL mode: Capture full cudagraph for all batches. Can be good for small
    models or workloads with small prompts; not supported by many backends.
    Generally for performance FULL_AND_PIECEWISE is better.

    FULL_DECODE_ONLY mode: Capture full cudagraph for decode batches only.
    Mixed prefill-decode batches are run without cudagraphs. Can be good for
    decode instances in a P/D setup where prefill is not as important so we
    can save some memory.

    FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
    piecewise cudagraph for prefill and mixed prefill-decode batches.
    This is the most performant mode for most models and is the default.

    Currently, the cudagraph mode is only used for the v1 engine.
    Note that the cudagraph logic is generally orthogonal to the
    compilation logic. While piecewise cudagraphs require piecewise
    compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
    cudagraphs are supported with and without compilation.

    Warning: This flag is new and subject to change; in addition,
    more modes may be added.
    """
    cudagraph_num_of_warmups: int = 0
    """Number of warmup runs for cudagraph.
    It means the first several runs will be treated as warmup runs.
    Only after that, the execution will be recorded, and the recorded
    cudagraph will be used for subsequent runs."""
    cudagraph_capture_sizes: list[int] = None  # type: ignore[assignment]
    """Sizes to capture cudagraph.
    - None (default): capture sizes are inferred from vllm config.
    - list[int]: capture sizes are specified as given."""
    cudagraph_copy_inputs: bool = False
    """Whether to copy input tensors for
    cudagraph. If the caller can guarantee that the same input buffers
    are always used, it can set this to False. Otherwise, it should
    set this to True, and the compiler will copy the input to an
    internally managed buffer. Default is False.
    Note that this flag is only effective when cudagraph_mode is PIECEWISE.
    """
    cudagraph_specialize_lora: bool = True
    """Whether to create separate cuda graphs for cases with and without active
    LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used
    for all cases, incurring the overhead of running LoRA ops even when no
    adapters are active. Setting this to True will remove this overhead at the
    cost of increased startup time and slightly higher memory usage.
    When `enable_lora` is False, this option has no effect.
    """

    use_inductor_graph_partition: bool = None  # type: ignore[assignment]
    """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
    This partition happens at inductor codegen time after all passes and fusions
    are finished. It generates a single `call` function which wraps
    cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops
    outside the partition functions. For a graph with N cudagraph-unsafe ops
    (e.g., Attention), there would be N+1 partitions. To mark an op as
    cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when
    register the custom op.

    This config supports both full cudagraph and piecewise cudagraph without
    compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper
    to each partition. For N+1 partitions, there would be N+1
    CUDAGraph wrapper instances.

    For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the
    inductor `call` function in the model runner. The top-level full cudagraph
    capture ignores all partitioning.
    """

    pass_config: PassConfig = field(default_factory=PassConfig)
    """Custom inductor passes, see PassConfig for more details"""

    max_cudagraph_capture_size: int = None  # type: ignore[assignment]
    """The maximum cudagraph capture size.

    If cudagraph_capture_sizes is specified, this will be set to the largest
    size in that list (or checked for consistency if specified). If
    cudagraph_capture_sizes is not specified, the list of sizes is generated
    automatically following the pattern:

        [1, 2, 4] + list(range(8, 256, 8)) + list(
        range(256, max_cudagraph_capture_size + 1, 16))

    If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2,
    512) by default. This avoids OOM in tight memory scenarios with small
    max_num_seqs, and prevents capture of many large graphs (>512) that would
    greatly increase startup time with limited performance benefit.
    """

    dynamic_shapes_config: DynamicShapesConfig = field(
        default_factory=DynamicShapesConfig
    )
    """Configuration for dynamic shapes options"""

    local_cache_dir: str = field(default=None, init=False)  # type: ignore
    """local cache dir for each rank"""

    fast_moe_cold_start: bool | None = None
    """Optimization for fast MOE cold start.

    This is a bit of a hack that assumes that:
    1. the only decoder forward pass being run is the current model
    2. the decoder forward pass runs all of the MOEs in the order in which they
       are initialized

    When the above two conditions hold, this option greatly decreases cold start
    time for MOE models.

    The options are:
    - True: optimization is always on
    - False: optimization is always off
    - None: optimization is on usually but off for speculative decoding

    If conditions 1&2 don't hold then this option will lead to silent
    incorrectness.
    The only condition in which this doesn't hold is speculative
    decoding, where there is a draft model that may have MOEs in them.

    NB: We're working on a longer-term solution that doesn't need these assumptions.
    """

    # keep track of enabled and disabled custom ops
    enabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
    """custom ops that are enabled"""
    disabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
    """custom ops that are disabled"""
    traced_files: set[str] = field(default_factory=set, init=False)
    """files that are traced for compilation"""
    compilation_time: float = field(default=0.0, init=False)
    """time taken for compilation"""
    encoder_compilation_time: float = field(default=0.0, init=False)
    """time taken for multimodal encoder compilation"""

    static_forward_context: dict[str, Any] = field(default_factory=dict, init=False)
    """Per-model forward context
    Map from layer name to layer objects that need to be accessed outside
    model code, e.g., Attention, FusedMOE when dp_size>1."""

    static_all_moe_layers: list[str] = field(default_factory=list, init=False)
    """The names of all the MOE layers in the model
    """

    # Attention ops; used for piecewise cudagraphs
    # Use PyTorch operator format: "namespace::name"
    _attention_ops: ClassVar[list[str]] = [
        "vllm::unified_attention_with_output",
        "vllm::unified_mla_attention_with_output",
        "vllm::mamba_mixer2",
        "vllm::mamba_mixer",
        "vllm::short_conv",
        "vllm::linear_attention",
        "vllm::plamo2_mamba_mixer",
        "vllm::qwen_gdn_attention_core",
        "vllm::gdn_attention_core_xpu",
        "vllm::olmo_hybrid_gdn_full_forward",
        "vllm::kda_attention",
        "vllm::sparse_attn_indexer",
        "vllm::rocm_aiter_sparse_attn_indexer",
        "vllm::deepseek_v4_attention",
    ]

    def compute_hash(self) -> str:
        """
        Return a hash over the compilation settings that shape the graph.

        The graph considered runs from input ids/embeddings to the final
        hidden states; anything before or after that is out of scope.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        # Opt-out scheme: declared fields are included by default and only
        # the small set below is left out. Nested configs contribute their
        # own compute_hash() so the result does not depend on object identity.
        skipped = {
            # Paths/dirs and runtime/metrics that don't affect compiled graph
            "debug_dump_path",
            "cache_dir",
            "local_cache_dir",
            "traced_files",
            "compilation_time",
            "encoder_compilation_time",
            "static_forward_context",
            # Nested configs, re-added below via their own hashes
            "pass_config",
            "dynamic_shapes_config",
        }

        factors = get_hash_factors(self, skipped)
        for nested_name in ("pass_config", "dynamic_shapes_config"):
            factors[nested_name] = getattr(self, nested_name).compute_hash()
        return hash_factors(factors)

    def __repr__(self) -> str:
        """Return the string form of the dumped config, minus noisy fields.

        Left out of the dump: the bookkeeping fields listed below, the
        `post_grad_custom_post_pass` entry of `inductor_compile_config`,
        every `pass_config` attribute that still equals the value on a
        freshly constructed `PassConfig`, and fields that were never set.
        """
        # Bookkeeping fields are always dropped from the dump.
        exclude: dict[str, bool | dict[str, bool]] = {
            name: True
            for name in (
                "static_forward_context",
                "enabled_custom_ops",
                "disabled_custom_ops",
                "compilation_time",
                "encoder_compilation_time",
                "traced_files",
            )
        }
        exclude["inductor_compile_config"] = {"post_grad_custom_post_pass": True}

        # Drop pass_config attributes that still hold their default value.
        unchanged_pass_attrs = {
            attr: True
            for attr, default_val in vars(PassConfig()).items()
            if getattr(self.pass_config, attr) == default_val
        }
        if unchanged_pass_attrs:
            exclude["pass_config"] = unchanged_pass_attrs

        dumped = TypeAdapter(CompilationConfig).dump_python(
            self, exclude=exclude, exclude_unset=True
        )
        return str(dumped)

    __str__ = __repr__

    @field_validator("mode", mode="before")
    @classmethod
    def validate_mode_before(cls, value: Any) -> Any:
        """
        Translate a string `mode` into the matching `CompilationMode` member.

        Names are matched case-insensitively (e.g. NONE, STOCK_TORCH_COMPILE,
        DYNAMO_TRACE_ONCE, VLLM_COMPILE). Non-string values, such as the
        integers 0-3, are returned unchanged.

        Raises:
            ValueError: if the string is not a `CompilationMode` member name.
        """
        if not isinstance(value, str):
            return value

        members = CompilationMode.__members__
        upper_name = value.upper()
        if upper_name in members:
            return members[upper_name]

        raise ValueError(
            f"Invalid compilation mode: {value}. "
            f"Valid modes are: {', '.join(CompilationMode.__members__.keys())}"
        )

    @field_validator("cudagraph_mode", mode="before")
    @classmethod
    def validate_cudagraph_mode_before(cls, value: Any) -> Any:
        """Enable parsing of the `cudagraph_mode` enum type from string.

        String values are matched case-insensitively against the member
        names of `CUDAGraphMode`. Non-string values are returned unchanged.

        Raises:
            ValueError: if the string is not a `CUDAGraphMode` member name.
        """
        if isinstance(value, str):
            mode_name = value.upper()

            if mode_name not in CUDAGraphMode.__members__:
                # Raise a descriptive ValueError, as validate_mode_before
                # does, rather than letting the enum lookup fail with a bare
                # KeyError that carries no hint about the accepted names.
                raise ValueError(
                    f"Invalid cudagraph mode: {value}. "
                    f"Valid modes are: {', '.join(CUDAGraphMode.__members__.keys())}"
                )

            return CUDAGraphMode[mode_name]
        return value

    @field_validator("pass_config", mode="before")
    @classmethod
    def validate_pass_config_before(cls, value: Any) -> Any:
        """Build a `PassConfig` when `pass_config` is given as a dictionary.

        The dictionary entries are passed as keyword arguments; any other
        value is returned unchanged.
        """
        return PassConfig(**value) if isinstance(value, dict) else value

    @field_validator("compile_cache_save_format")
    @classmethod
    def validate_compile_cache_save_format(cls, value: str) -> str:
        """Accept only the 'binary' and 'unpacked' cache save formats.

        Returns the value unchanged when it is valid.

        Raises:
            ValueError: for any other value.
        """
        allowed_formats = ("binary", "unpacked")
        if value in allowed_formats:
            return value
        raise ValueError(
            f"compile_cache_save_format must be 'binary' or 'unpacked', "
            f"got: {value}"
        )

    @field_validator(
        "level",
        "mode",
        "cudagraph_mode",
        "max_cudagraph_capture_size",
        "use_inductor_graph_partition",
        "ir_enable_torch_wrap",
        mode="wrap",
    )
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Let `None` through untouched for fields whose setup is delayed.

        Any other value is passed to `handler` and its result is returned.
        """
        return value if value is None else handler(value)

    def __post_init__(self) -> None:
        """Validate and normalize the config after field initialization.

        Fills in Inductor config defaults, wraps the configured Inductor
        passes, forces the rotary-embedding custom op when a RoPE fusion pass
        is enabled, validates `custom_ops`, the backend and the encoder
        cudagraph settings, and finally resolves an empty `backend` to the
        platform default.

        Raises:
            ValueError: On an invalid custom op entry, an unsupported backend
                for VLLM_COMPILE mode, `use_inductor_graph_partition` with
                torch older than 2.9, or invalid encoder cudagraph settings.
        """
        count_none = self.custom_ops.count("none")
        count_all = self.custom_ops.count("all")
        assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"

        # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
        # 1. A bug in PyTorch, fixed in 2.7:
        #    https://github.com/pytorch/pytorch/issues/147924
        # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't
        #    work with V2. Addressing this will take extra engineering effort
        #    and it is not yet a priority. RFC here:
        #    https://github.com/vllm-project/vllm/issues/14703

        KEY = "enable_auto_functionalized_v2"
        if KEY not in self.inductor_compile_config:
            self.inductor_compile_config[KEY] = False

        # Tie inductor runtime assertions to debug logging mode.
        # These assertions add ~2ms overhead per forward pass on large
        # models (e.g., DeepSeek-R1 671B: ~340 assert_size_stride + ~60
        # assert_alignment calls per forward). PyTorch >= 2.12 has a
        # native fix (assert-once), so we only apply this workaround on
        # older versions. On torch < 2.12, enable asserts only when
        # VLLM_LOGGING_LEVEL=DEBUG. Users can still override explicitly
        # via --compilation-config '{"inductor_compile_config":
        # {"size_asserts": true, ...}}'.
        # See: https://github.com/pytorch/pytorch/issues/177719
        if not is_torch_equal_or_newer("2.12.0.dev"):
            enable_asserts = envs.VLLM_LOGGING_LEVEL == "DEBUG"
            for key in (
                "size_asserts",
                "alignment_asserts",
                "scalar_asserts",
            ):
                self.inductor_compile_config.setdefault(key, enable_asserts)

        for k, v in self.inductor_passes.items():
            if isinstance(v, str):
                # Resolve the pass from its qualified name. `__import__`
                # returns the top-level package for dotted module names, so
                # it cannot be used to look up an attribute of a submodule.
                func = resolve_obj_by_qualname(v)
            else:
                assert callable(v), f"pass {k} should be callable or a qualified name"
                func = v
            self.inductor_compile_config[k] = (
                func if isinstance(func, InductorPass) else CallableInductorPass(func)
            )

        if (
            self.pass_config.enable_qk_norm_rope_fusion
            and "+rotary_embedding" not in self.custom_ops
        ):
            # TODO(zhuhaoran): support rope native forward match and remove this.
            # Linked issue: https://github.com/vllm-project/vllm/issues/28042
            self.custom_ops.append("+rotary_embedding")
        if (
            self.pass_config.fuse_rope_kvcache
            and "+rotary_embedding" not in self.custom_ops
        ):
            # TODO(Rohan138): support rope native forward match and remove this.
            # Linked issue: https://github.com/vllm-project/vllm/issues/28042
            self.custom_ops.append("+rotary_embedding")

        if (
            is_torch_equal_or_newer("2.9.0.dev")
            and "combo_kernels" not in self.inductor_compile_config
            and "benchmark_combo_kernel" not in self.inductor_compile_config
            # (fixme @boyuan) combo kernel does not support cpu yet.
            and not current_platform.is_cpu()
        ):
            # use horizontal fusion, which is useful for fusing qk-norm and
            # qk-rope when query and key have different shapes.
            self.inductor_compile_config["combo_kernels"] = True
            self.inductor_compile_config["benchmark_combo_kernel"] = True

        if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
            "2.9.0.dev"
        ):
            raise ValueError(
                "use_inductor_graph_partition is only "
                "supported with torch>=2.9.0.dev. Set "
                "use_inductor_graph_partition=False instead."
            )

        for op in self.custom_ops:
            # Slice instead of indexing so that an empty entry is reported as
            # invalid syntax rather than raising IndexError.
            if op[:1] not in {"+", "-"} and op not in {"all", "none"}:
                raise ValueError(
                    f"Invalid syntax '{op}' for custom op, "
                    "must be 'all', 'none', '+op' or '-op' "
                    "(where 'op' is the registered op name)"
                )

        # Currently only the eager and inductor backends are supported
        # for piecewise compilation. Custom backends are not supported for
        # piecewise compilation. Update when more backends are supported.
        if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
            "",
            "eager",
            "inductor",
        ]:
            raise ValueError(
                f"Invalid backend for piecewise compilation: {self.backend}"
            )

        # Validate encoder CUDA graph configuration
        if (
            self.cudagraph_mm_encoder
            and self.encoder_cudagraph_max_vision_items_per_batch < 0
        ):
            raise ValueError(
                "encoder_cudagraph_max_vision_items_per_batch must be "
                "non-negative (0 = auto-infer)"
            )
        if (
            self.cudagraph_mm_encoder
            and self.encoder_cudagraph_max_frames_per_batch is not None
            and self.encoder_cudagraph_max_frames_per_batch < 0
        ):
            raise ValueError(
                "encoder_cudagraph_max_frames_per_batch must be "
                "non-negative (None = auto-infer)"
            )

        if self.encoder_cudagraph_token_budgets and any(
            b <= 0 for b in self.encoder_cudagraph_token_budgets
        ):
            raise ValueError(
                f"All encoder_cudagraph_token_budgets must be positive, "
                f"got {self.encoder_cudagraph_token_budgets}"
            )

        # An empty backend means "use the platform default".
        if self.backend == "":
            self.backend = current_platform.get_compile_backend()

    def init_backend(
        self,
        vllm_config: "VllmConfig",
        prefix: str = "",
        is_encoder: bool = False,
    ) -> str | Callable:
        """
        Initialize the backend for the compilation config from a vllm config.
        Arguments:
            vllm_config: The vllm config to initialize the backend from.
            prefix: Cache directory prefix for this compiled module.
            is_encoder: Whether this module is used in an encoder (as
                opposed to a text backbone).
        Returns:
            The backend for the compilation config.
        """
        if self.mode is None:
            raise ValueError(
                "No compilation mode is set. This method should only be "
                "called via vllm config where the level is set if none is "
                "provided."
            )
        if self.mode == CompilationMode.NONE:
            raise ValueError("No compilation mode is set.")

        from torch._dynamo.backends.registry import list_backends

        torch_backends = list_backends(exclude_tags=tuple())
        if self.mode in [
            CompilationMode.STOCK_TORCH_COMPILE,
            CompilationMode.DYNAMO_TRACE_ONCE,
        ]:
            if self.backend in torch_backends:
                return self.backend
            return resolve_obj_by_qualname(self.backend)

        assert self.mode == CompilationMode.VLLM_COMPILE
        if self.backend not in ["eager", "inductor"]:
            logger.info("Using OOT custom backend for compilation.")

        from vllm.compilation.backends import VllmBackend

        return VllmBackend(vllm_config, prefix=prefix, is_encoder=is_encoder)

    def post_init_cudagraph_sizes(self) -> None:
        """Complete the initialization once cudagraph related configs are set.

        This includes:
        - expanding the "cudagraph_capture_sizes" placeholder in
          `compile_sizes` into the values of `cudagraph_capture_sizes`;
        - de-duplicating the resulting `compile_sizes`, keeping the order in
          which each size was first seen (`None` becomes an empty list);
        - sorting `cudagraph_capture_sizes` in ascending order, in place, and
          checking that its largest entry equals
          `max_cudagraph_capture_size`.
        """

        computed_compile_sizes: list[int] = []
        if self.compile_sizes is not None:
            for x in self.compile_sizes:
                if isinstance(x, str):
                    assert x == "cudagraph_capture_sizes", (
                        "Unrecognized size type in compile_sizes, "
                        f"expect 'cudagraph_capture_sizes', got {x}"
                    )
                    computed_compile_sizes.extend(self.cudagraph_capture_sizes)
                else:
                    assert isinstance(x, int)
                    computed_compile_sizes.append(x)
        # De-duplicate AFTER expanding the placeholder: an explicit size may
        # also be one of the cudagraph capture sizes. dict.fromkeys keeps the
        # first-seen order, so the result is deterministic.
        self.compile_sizes = list(dict.fromkeys(computed_compile_sizes))  # type: ignore

        # make sure the sizes are in ascending order
        self.cudagraph_capture_sizes.sort()
        if self.cudagraph_capture_sizes:
            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size

    def set_splitting_ops_for_v1(
        self, all2all_backend: str, data_parallel_size: int = 1
    ):
        """Finalize `splitting_ops` and reconcile `cudagraph_mode` with it.

        Outside of VLLM_COMPILE mode only an unset `splitting_ops` is replaced
        by an empty list. Otherwise the splitting ops are derived from the
        pass config, and `cudagraph_mode` is downgraded (with a warning)
        whenever the resulting splitting ops cannot support it.

        Args:
            all2all_backend: Name of the all2all backend. The value
                "deepep_high_throughput" together with
                `data_parallel_size > 1` disables CUDA graphs.
            data_parallel_size: Data parallel size, only used for the DeepEP
                check above.
        """
        # To be compatible with OOT hardware plugin platforms (for example
        # vllm-ascend) which currently only support sequence parallelism in
        # eager mode.
        if self.mode != CompilationMode.VLLM_COMPILE:
            if self.splitting_ops is None:
                self.splitting_ops = []
            return

        if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
            self.set_splitting_ops_for_attn_fusion()
        else:
            if self.splitting_ops is None:
                # NOTE: When using full cudagraph, instead of setting an empty
                # list and capture the full cudagraph inside the flattened fx
                # graph, we keep the piecewise fx graph structure but capture
                # the full cudagraph outside the fx graph. This reduces some
                # cpu overhead when the runtime batch_size is not cudagraph
                # captured. see https://github.com/vllm-project/vllm/pull/20059
                # for details. Make a copy to avoid mutating the class-level
                # list via reference.
                self.splitting_ops = list(self._attention_ops)

                # unified_kv_cache_update has a string param that prevents Inductor
                # from reusing piecewise graphs. Remove it from the compiled graph.
                # This has the side-effect of excluding cache from cudagraphs but
                # that doesn't seem to affect performance.
                # https://github.com/vllm-project/vllm/issues/33267
                if not self.use_inductor_graph_partition:
                    if self.pass_config.fuse_rope_kvcache:
                        # Adjacent literals are concatenated verbatim, so
                        # each fragment carries its own trailing space.
                        logger.warning_once(
                            "fuse_rope_kvcache is enabled, but splitting_ops is None "
                            "and Inductor graph partition is not enabled. "
                            "Disabling fuse_rope_kvcache. "
                            "Please either set splitting_ops to an empty list [] "
                            "or set use_inductor_graph_partition to True "
                            "to enable RoPE+KV cache fusion."
                        )
                        self.pass_config.fuse_rope_kvcache = False
                    self.splitting_ops.append("vllm::unified_kv_cache_update")
                    self.splitting_ops.append("vllm::unified_mla_kv_cache_update")

            elif len(self.splitting_ops) == 0:
                if (
                    self.cudagraph_mode == CUDAGraphMode.PIECEWISE
                    or self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
                ):
                    logger.warning_once(
                        "Using piecewise cudagraph with empty splitting_ops"
                    )
                if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
                    logger.warning_once(
                        "Piecewise compilation with empty splitting_ops does not "
                        "contain piecewise cudagraph. Setting cudagraph_"
                        "mode to NONE. Hint: If you are using attention "
                        "backends that support cudagraph, consider manually "
                        "setting cudagraph_mode to FULL or FULL_DECODE_ONLY "
                        "to enable full cudagraphs."
                    )
                    self.cudagraph_mode = CUDAGraphMode.NONE
                elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
                    logger.warning_once(
                        "Piecewise compilation with empty splitting_ops does "
                        "not contain piecewise cudagraph. Setting "
                        "cudagraph_mode to FULL."
                    )
                    self.cudagraph_mode = CUDAGraphMode.FULL
                self.splitting_ops = []

        if (
            not self.use_inductor_graph_partition
            and (self.pass_config.enable_sp or self.pass_config.fuse_gemm_comms)
            and self.splitting_ops
        ):
            logger.warning_once(
                "Sequence parallelism requires full-graph compilation when "
                "use_inductor_graph_partition is off. Setting splitting_ops "
                "to an empty list to preserve SP and async TP."
            )
            self.splitting_ops = []
            if self.cudagraph_mode.has_piecewise_cudagraphs():
                logger.warning_once(
                    "Sequence parallelism is incompatible with piecewise "
                    "cudagraph when use_inductor_graph_partition is off. "
                    "Setting cudagraph_mode to FULL."
                )
                self.cudagraph_mode = CUDAGraphMode.FULL

        # Disable CUDA graphs for DeepEP high-throughput since it's not CG
        # compatible
        if (
            all2all_backend == "deepep_high_throughput"
            and data_parallel_size > 1
            and self.cudagraph_mode != CUDAGraphMode.NONE
        ):
            # TODO: Piecewise Cuda graph might be enabled
            # if torch compile cache key issue fixed
            # See https://github.com/vllm-project/vllm/pull/25093
            logger.info(
                "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels "
                "are optimized for prefill and are incompatible with CUDA Graphs. "
                "In order to use CUDA Graphs for decode-optimized workloads, "
                "use --all2all-backend with another option, such as "
                "deepep_low_latency, nixl_ep, or allgather_reducescatter."
            )
            self.cudagraph_mode = CUDAGraphMode.NONE

    def set_splitting_ops_for_attn_fusion(self):
        """Set up `splitting_ops` for the attention+quant fusion pass.

        Requires `pass_config.fuse_attn_quant`. An unset `splitting_ops`
        becomes an empty list, and in that case a cudagraph mode with
        piecewise cudagraphs is switched to FULL with a warning. Finally
        asserts that the attention ops are not part of `splitting_ops`.
        """
        assert self.pass_config.fuse_attn_quant

        splitting_ops_unset = self.splitting_ops is None
        if splitting_ops_unset:
            self.splitting_ops = []
        if splitting_ops_unset and self.cudagraph_mode.has_piecewise_cudagraphs():
            logger.warning_once(
                "fuse_attn_quant is incompatible with piecewise "
                "cudagraph when use_inductor_graph_partition is off. "
                "In this case, splitting_ops will be set to empty "
                "list, and cudagraph_mode will be set to FULL. "
                "Please ensure you are using attention backends that "
                "support cudagraph or set cudagraph_mode to NONE "
                "explicitly if encountering any problems."
            )
            self.cudagraph_mode = CUDAGraphMode.FULL

        has_attention_split = self.splitting_ops_contain_attention()
        assert not has_attention_split, (
            "attention ops should not be in splitting_ops when fuse_attn_quant is True"
        )

    def splitting_ops_contain_attention(self) -> bool:
        """Return True if every op in `_attention_ops` is in `splitting_ops`.

        Returns False while `splitting_ops` is still None.
        """
        if self.splitting_ops is None:
            return False
        for attn_op in self._attention_ops:
            if attn_op not in self.splitting_ops:
                return False
        return True

    def splitting_ops_contain_kv_cache_update(self) -> bool:
        """Return True if both KV cache update ops are (or will be) split on.

        With `splitting_ops` still None the answer anticipates
        set_splitting_ops_for_v1: without Inductor graph partition and
        without attn+quant fusion the KV cache update ops get appended there
        (https://github.com/vllm-project/vllm/issues/33267), so True is
        returned even though they are not in `splitting_ops` yet.
        """
        current_ops = self.splitting_ops
        if current_ops is None:
            # Not set yet: report what set_splitting_ops_for_v1 will do.
            return (
                not self.use_inductor_graph_partition
                and not self.pass_config.fuse_attn_quant
            )

        required_ops = (
            "vllm::unified_kv_cache_update",
            "vllm::unified_mla_kv_cache_update",
        )
        return all(name in current_ops for name in required_ops)

    def is_attention_compiled_piecewise(self) -> bool:
        """Return True if attention ends up outside the compiled pieces.

        Requires the attention ops to be in `splitting_ops`. Beyond that the
        answer depends on where the graph is split: at the Dynamo FX level
        (mode must be VLLM_COMPILE) or by Inductor graph partition (backend
        must be "inductor" and the mode must not be NONE).
        """
        if not self.splitting_ops_contain_attention():
            return False

        if self.use_inductor_graph_partition:
            # Inductor partition case
            uses_inductor = self.backend == "inductor"
            return uses_inductor and self.mode != CompilationMode.NONE

        # Dynamo-level FX split case
        return self.mode == CompilationMode.VLLM_COMPILE

    def custom_op_log_check(self):
        """
        Log the enabled/disabled custom ops and warn about entries of the
        custom_ops field that have no effect on this model.
        It is called at the end of set_current_vllm_config,
        after the custom ops have been instantiated.
        """
        enabled = self.enabled_custom_ops
        disabled = self.disabled_custom_ops

        if len(enabled) + len(disabled) == 0:
            logger.debug("No custom ops found in model.")
            return

        logger.debug("enabled custom ops: %s", enabled)
        logger.debug("disabled custom ops: %s", disabled)

        ops_in_model = enabled | disabled
        for op in self.custom_ops:
            if op in {"all", "none"}:
                continue

            sign, op_name = op[0], op[1:]
            assert sign in {"+", "-"}, (
                "Invalid custom op syntax (should be checked during init)"
            )

            if op_name in ops_in_model:
                continue

            # The op is not used by the model: find out whether it exists at
            # all or is just not present in this model.
            # Note: Only imported op classes appear in the registry.
            from vllm.model_executor.custom_op import op_registry

            if op_name in op_registry:
                missing_str = "not present in model"
            else:
                missing_str = "doesn't exist (or wasn't imported/registered)"

            if sign == "+":
                enable_str = "enabling"
            else:
                enable_str = "disabling"

            logger.warning_once(
                "Op '%s' %s, %s with '%s' has no effect",
                op_name,
                missing_str,
                enable_str,
                op,
            )

    def is_custom_op_enabled(self, op: str) -> bool:
        """Return whether custom op `op` is enabled by `custom_ops`.

        Under "all" an op is enabled unless "-op" is listed; otherwise
        "none" must be present and an op is enabled only if "+op" is listed.
        """
        selected = self.custom_ops
        if "all" not in selected:
            assert "none" in selected
            return f"+{op}" in selected

        return f"-{op}" not in selected

    def resolve_cudagraph_mode_and_sizes(
        self,
        min_cg_support: "AttentionCGSupport",
        min_cg_attn_backend: str | None,
        uniform_decode_query_len: int = 1,
        tensor_parallel_size: int = 1,
        kv_cache_config: "KVCacheConfig | None" = None,
        max_num_reqs: int | None = None,
        is_profiling: bool = False,
    ) -> CUDAGraphMode:
        """Resolve the cudagraph mode against the attention backend support.

        Starting from `self.cudagraph_mode`, downgrades the mode (logging a
        warning) wherever `min_cg_support` is insufficient for it, adjusts
        the capture sizes for spec-decode when decode still uses full
        cudagraphs, and stores the resolved mode in `self.cudagraph_mode`.

        Args:
            min_cg_support: Cudagraph support level that the requested mode
                is checked against (presumably the minimum over all
                attention backends in use; confirm with the caller).
            min_cg_attn_backend: Backend name, only used in messages.
            uniform_decode_query_len: Query length of uniform decode
                batches; values above 1 are treated as spec-decode.
            tensor_parallel_size: Forwarded to
                `adjust_cudagraph_sizes_for_spec_decode`.
            kv_cache_config: If given together with `max_num_reqs`, used to
                check the number of available Mamba cache blocks.
            max_num_reqs: Reported as max_num_seqs; compared against
                `kv_cache_config.num_blocks` in the Mamba check.
            is_profiling: When True, the Mamba block check is skipped.

        Returns:
            The resolved mode. `CUDAGraphMode.NONE` if the configured mode
            is None or NONE.

        Raises:
            ValueError: If full cudagraphs are still required but the
                support level is NEVER, or if a Mamba model has fewer cache
                blocks than `max_num_reqs`.
        """
        from vllm.v1.attention.backend import AttentionCGSupport

        cudagraph_mode = self.cudagraph_mode
        if cudagraph_mode is None or cudagraph_mode == CUDAGraphMode.NONE:
            self.cudagraph_mode = CUDAGraphMode.NONE
            return CUDAGraphMode.NONE

        # Check cudagraph for mixed batch is supported
        if (
            cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL
            and min_cg_support != AttentionCGSupport.ALWAYS
        ):
            msg = (
                f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
                f"with {min_cg_attn_backend} backend (support: "
                f"{min_cg_support})"
            )
            if min_cg_support == AttentionCGSupport.NEVER:
                # if not supported any full cudagraphs, just raise it.
                msg += (
                    "; please try cudagraph_mode=PIECEWISE, and "
                    "make sure compilation mode is VLLM_COMPILE"
                )
                raise ValueError(msg)

            # attempt to resolve the full cudagraph related mode
            if self.splitting_ops_contain_attention():
                msg += "; setting cudagraph_mode=FULL_AND_PIECEWISE"
                cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE
            else:
                msg += "; setting cudagraph_mode=FULL_DECODE_ONLY"
                cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY
            logger.warning(msg)

        # check that if we are doing decode full-cudagraphs it is supported
        if (
            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
            and min_cg_support == AttentionCGSupport.NEVER
        ):
            msg = (
                f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
                f"with {min_cg_attn_backend} backend (support: "
                f"{min_cg_support})"
            )
            if self.mode == CompilationMode.VLLM_COMPILE and (
                self.splitting_ops_contain_attention()
                or self.use_inductor_graph_partition
            ):
                msg += (
                    "; setting cudagraph_mode=PIECEWISE because "
                    "attention is compiled piecewise"
                )
                cudagraph_mode = CUDAGraphMode.PIECEWISE
            else:
                msg += (
                    "; setting cudagraph_mode=NONE because "
                    "attention is not compiled piecewise"
                )
                cudagraph_mode = CUDAGraphMode.NONE
            logger.warning(msg)

        # check that if we are doing spec-decode + decode full-cudagraphs it is
        # supported
        if (
            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
            and uniform_decode_query_len > 1
            and min_cg_support.value < AttentionCGSupport.UNIFORM_BATCH.value
        ):
            msg = (
                f"CUDAGraphMode.{cudagraph_mode.name} is not supported"
                f" with spec-decode for attention backend "
                f"{min_cg_attn_backend} (support: {min_cg_support})"
            )
            if self.splitting_ops_contain_attention():
                msg += "; setting cudagraph_mode=PIECEWISE"
                cudagraph_mode = CUDAGraphMode.PIECEWISE
            else:
                msg += "; setting cudagraph_mode=NONE"
                cudagraph_mode = CUDAGraphMode.NONE
            logger.warning(msg)

        # double check that we can support full cudagraph if they are requested
        # even after automatic downgrades
        if (
            cudagraph_mode.has_full_cudagraphs()
            and min_cg_support == AttentionCGSupport.NEVER
        ):
            raise ValueError(
                f"CUDAGraphMode.{cudagraph_mode.name} is not "
                f"supported with {min_cg_attn_backend} backend ("
                f"support:{min_cg_support}) "
                "; please try cudagraph_mode=PIECEWISE, "
                "and make sure compilation mode is VLLM_COMPILE"
            )

        # Adjust cudagraph sizes to be a multiple of uniform_decode_query_len
        # to avoid: https://github.com/vllm-project/vllm/issues/28207 and temp-fix:
        # https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
        # Will be removed in the near future when we have separate cudagraph capture
        # sizes for decode and mixed prefill-decode.
        if (
            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
            and uniform_decode_query_len > 1
        ):
            self.adjust_cudagraph_sizes_for_spec_decode(
                uniform_decode_query_len,
                tensor_parallel_size,
            )

        # For Mamba models with FULL decode cudagraphs, each decode
        # sequence needs one Mamba cache block. The decode cudagraph
        # dispatcher already caps batch sizes at max_num_seqs, so we just
        # need to verify that enough blocks exist. Raising here instead
        # of silently capping cudagraph_capture_sizes avoids unintended
        # restrictions on PIECEWISE (prefill) cudagraphs.
        # See: https://github.com/vllm-project/vllm/issues/34094
        if (
            kv_cache_config is not None
            and max_num_reqs is not None
            and cudagraph_mode.has_full_cudagraphs()
            and not is_profiling
            and kv_cache_config.has_mamba_layers
            and max_num_reqs > kv_cache_config.num_blocks
        ):
            raise ValueError(
                f"max_num_seqs ({max_num_reqs}) exceeds available Mamba cache "
                f"blocks ({kv_cache_config.num_blocks}). Each decode sequence "
                "requires one Mamba cache block, so CUDA graph capture cannot "
                "proceed. Please lower max_num_seqs to at most "
                f"{kv_cache_config.num_blocks} or increase "
                "gpu_memory_utilization."
            )

        # Persist the resolved mode so later readers of the config see it.
        self.cudagraph_mode = cudagraph_mode
        return cudagraph_mode

    def adjust_cudagraph_sizes_for_spec_decode(
        self, uniform_decode_query_len: int, tensor_parallel_size: int
    ):
        """Align the cudagraph capture sizes with the spec-decode query length.

        Each capture size is passed through `round_up` with the required
        multiple, results above `max_cudagraph_capture_size` are dropped, and
        both `cudagraph_capture_sizes` and `max_cudagraph_capture_size` are
        updated. The multiple is `uniform_decode_query_len`, or the larger of
        it and `tensor_parallel_size` when sequence parallelism is enabled
        with `tensor_parallel_size > 1`.

        Raises:
            ValueError: If no multiple satisfies both spec-decode and
                sequence parallelism, or if no capture size is left.
        """
        multiple_of = uniform_decode_query_len
        if tensor_parallel_size > 1 and self.pass_config.enable_sp:
            multiple_of = max(uniform_decode_query_len, tensor_parallel_size)
            fits_both = (
                multiple_of % uniform_decode_query_len == 0
                and multiple_of % tensor_parallel_size == 0
            )
            if not fits_both:
                raise ValueError(
                    f"Can't determine cudagraph shapes that are both a "
                    f"multiple of {uniform_decode_query_len} "
                    f"(num_speculative_tokens + 1) required by spec-decode "
                    f"and {tensor_parallel_size} (tensor_parallel_size) "
                    f"required by sequence parallelism please adjust "
                    f"num_speculative_tokens or disable sequence parallelism"
                )

        # Nothing to adjust without capture sizes or with a trivial multiple.
        if not self.cudagraph_capture_sizes or multiple_of <= 1:
            return

        size_limit = self.max_cudagraph_capture_size
        assert size_limit is not None

        kept_sizes = set()
        for size in self.cudagraph_capture_sizes:
            rounded = round_up(size, multiple_of)
            if rounded <= size_limit:
                kept_sizes.add(rounded)
        rounded_sizes = sorted(kept_sizes)

        if not rounded_sizes:
            if multiple_of > size_limit:
                raise ValueError(
                    f"No valid cudagraph sizes after rounding to multiple of {multiple_of} "
                    f"(num_speculative_tokens + 1 or tp if sequence parallelism is enabled)"
                    f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1}"
                    f") or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})"
                    f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})"
                )
            # Everything rounded past the limit, but the multiple itself
            # still fits: use it as the only capture size.
            rounded_sizes = [multiple_of]

        self.max_cudagraph_capture_size = rounded_sizes[-1]
        self.cudagraph_capture_sizes = rounded_sizes

    def get_compile_ranges(self) -> list[Range]:
        """Build the compile ranges from `compile_ranges_endpoints`.

        The endpoints are de-duplicated and sorted; the first range starts at
        1 and every further range starts right after the previous endpoint.
        Returns an empty list when no endpoints are configured.
        """
        configured = self.compile_ranges_endpoints
        if configured is None:
            return []

        ranges = []
        range_start = 1
        for endpoint in sorted(set(configured)):
            ranges.append(Range(range_start, endpoint))
            range_start = endpoint + 1
        return ranges

backend = '' class-attribute instance-attribute

The backend for compilation. It needs to be a string:

  • "" (empty string): use the default backend ("inductor" on CUDA-alike platforms).
  • "eager"/"openxla"/...: use the specified backend registered in PyTorch.
  • "full.module.name": a qualified name which can be used to import the

backend function.

We use a string to avoid serialization issues when using compilation in a distributed setting. When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the compilation mode is 3, the backend supports both whole-graph and piecewise compilation; available backends include eager, inductor, and custom backends, the latter of which can be defined via get_compile_backend. Furthermore, compilation is only piecewise if splitting ops is set accordingly and use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation.

cache_dir = '' class-attribute instance-attribute

The directory to store the compiled graph, to accelerate Inductor compilation. By default, it will use model-related information to generate a cache directory.

compilation_time = field(default=0.0, init=False) class-attribute instance-attribute

time taken for compilation

compile_cache_save_format = field(default_factory=(lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT)) class-attribute instance-attribute

Format for saving torch compile cache:

  • "binary": saves as binary file (multiprocess safe)

  • "unpacked": saves as directory structure for inspection/debugging (NOT multiprocess safe)

Defaults to VLLM_COMPILE_CACHE_SAVE_FORMAT if not specified.

compile_mm_encoder = False class-attribute instance-attribute

Whether or not to compile the multimodal encoder. Currently, this only works for Qwen2_5_vl and mLLaMa4 models on selected platforms. It may also work for models loaded with the Transformers modeling backend if the encoder is compilable. Disabled by default until more models are supported/tested to work.

compile_ranges_endpoints = None class-attribute instance-attribute

Endpoints for Inductor compile ranges. The compile ranges are [1, endpoints[0]], [endpoints[0] + 1, endpoints[1]], ..., [endpoints[-1] + 1, max_num_batched_tokens]. Compile sizes are also used as single-element ranges; each such range is represented as [compile_sizes[i], compile_sizes[i]].

If a range overlaps with the compile size, graph for compile size will be prioritized, i.e. if we have a range [1, 8] and a compile size 4, graph for compile size 4 will be compiled and used instead of the graph for range [1, 8].

compile_sizes = None class-attribute instance-attribute

Sizes to compile for inductor. In addition to integers, it also supports "cudagraph_capture_sizes" to specify the sizes for cudagraph capture.

cudagraph_capture_sizes = None class-attribute instance-attribute

Sizes to capture cudagraph.

  • None (default): capture sizes are inferred from vllm config.
  • list[int]: capture sizes are specified as given.

cudagraph_copy_inputs = False class-attribute instance-attribute

Whether to copy input tensors for cudagraph. If the caller can guarantee that the same input buffers are always used, it can set this to False. Otherwise, it should set this to True, and the compiler will copy the input to an internally managed buffer. Default is False. Note that this flag is only effective when cudagraph_mode is PIECEWISE.

cudagraph_mm_encoder = False class-attribute instance-attribute

Enable CUDA graph capture for multimodal encoder (ViT). When enabled, captures full encoder forward as CUDA graph for each token budget level.

cudagraph_mode = None class-attribute instance-attribute

The mode of the cudagraph:

  • NONE, no cudagraph capture.
  • PIECEWISE.
  • FULL.
  • FULL_DECODE_ONLY.
  • FULL_AND_PIECEWISE. (v1 default)

PIECEWISE mode: Build piecewise cudagraphs only, keeping the cudagraph-incompatible ops (i.e. some attention ops) outside the cudagraph for general flexibility.

FULL mode: Capture full cudagraph for all batches. Can be good for small models or workloads with small prompts; not supported by many backends. Generally for performance FULL_AND_PIECEWISE is better.

FULL_DECODE_ONLY mode: Capture full cudagraph for decode batches only. Mixed prefill-decode batches are run without cudagraphs. Can be good for decode instances in a P/D setup where prefill is not as important so we can save some memory.

FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and piecewise cudagraph for prefill and mixed prefill-decode batches. This is the most performant mode for most models and is the default.

Currently, the cudagraph mode is only used for the v1 engine. Note that the cudagraph logic is generally orthogonal to the compilation logic. While piecewise cudagraphs require piecewise compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full cudagraphs are supported with and without compilation.

Warning: This flag is new and subject to change; in addition, more modes may be added.

cudagraph_num_of_warmups = 0 class-attribute instance-attribute

Number of warmup runs for cudagraph. It means the first several runs will be treated as warmup runs. Only after that, the execution will be recorded, and the recorded cudagraph will be used for subsequent runs.

cudagraph_specialize_lora = True class-attribute instance-attribute

Whether to create separate cuda graphs for cases with and without active LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used for all cases, incurring the overhead of running LoRA ops even when no adapters are active. Setting this to True will remove this overhead at the cost of increased startup time and slightly higher memory usage. When enable_lora is False, this option has no effect.

custom_ops = field(default_factory=list) class-attribute instance-attribute

Fine-grained control over which custom ops to enable/disable. Use 'all' to enable all, 'none' to disable all. Also specify a list of custom op names to enable (prefixed with a '+'), or disable (prefixed with a '-'). Examples:

  • 'all,-op1' to enable all except op1
  • 'none,+op1,+op2' to enable only op1 and op2

By default, all custom ops are enabled when running without Inductor and disabled when running with Inductor: mode>CompilationMode.NONE and backend="inductor". Inductor generates (fused) Triton kernels for disabled custom ops.

debug_dump_path = None class-attribute instance-attribute

The path to dump the debug information.

disabled_custom_ops = field(default_factory=Counter, init=False) class-attribute instance-attribute

custom ops that are disabled

dynamic_shapes_config = field(default_factory=DynamicShapesConfig) class-attribute instance-attribute

Configuration for dynamic shapes options

enabled_custom_ops = field(default_factory=Counter, init=False) class-attribute instance-attribute

custom ops that are enabled

encoder_compilation_time = field(default=0.0, init=False) class-attribute instance-attribute

time taken for multimodal encoder compilation

encoder_cudagraph_max_frames_per_batch = None class-attribute instance-attribute

Maximum total video frames per batch for encoder CUDA graph capture. Controls the cu_seqlens buffer size (one entry per attention sequence, i.e. one per video frame). If None (default), auto-inferred as encoder_cudagraph_max_vision_items_per_batch * max_frames_per_video (model-specific value according to processing_info). Positive value overrides auto-inference and applies to all budget levels. If we limit the video count per prompt to 0, it will also be set to 0 (i.e., fall back to image-only mode).

encoder_cudagraph_max_vision_items_per_batch = 0 class-attribute instance-attribute

Maximum number of images/videos per batch for encoder CUDA graph capture. Determines the fixed batch size used during graph capture. If 0 (default), auto-inferred as max_budget // min_budget from the model's budget range. User-provided positive value overrides auto-inference.

encoder_cudagraph_token_budgets = field(default_factory=list) class-attribute instance-attribute

Token budget levels for encoder CUDA graph capture. Each budget defines a fixed token capacity. At runtime, images are greedy-packed into the smallest fitting budget and the corresponding CUDA graph is replayed. If empty (default), auto-inferred from model architecture as power-of-2 levels from the model's estimated min budget to max budget. User-provided values override auto-inference. Example: [2048, 4096, 8192, 13824]

fast_moe_cold_start = None class-attribute instance-attribute

Optimization for fast MOE cold start.

This is a bit of a hack that assumes that: 1. the only decoder forward pass being run is the current model 2. the decoder forward pass runs all of the MOEs in the order in which they are initialized

When the above two conditions hold, this option greatly decreases cold start time for MOE models.

The options are: - True: optimization is always on - False: optimization is always off - None: optimization is on usually but off for speculative decoding

If conditions 1&2 don't hold then this option will lead to silent incorrectness. The only condition in which this doesn't hold is speculative decoding, where there is a draft model that may have MOEs in them.

NB: We're working on a longer-term solution that doesn't need these assumptions.

inductor_compile_config = field(default_factory=dict) class-attribute instance-attribute

Additional configurations for inductor. - None: use default configurations.

inductor_passes = field(default_factory=dict) class-attribute instance-attribute

Additional passes for inductor. It is a dictionary from pass name to pass function qualified name. We use function name because the config uses JSON format. If we pass the config from Python, functions can also be passed directly via Python object constructor, e.g. CompilationConfig(inductor_passes={"a": func}).

ir_enable_torch_wrap = None class-attribute instance-attribute

If True, enable vllm_ir torch custom op wrapping during the forward pass. When False, torch custom op wrapping is disabled, allowing Dynamo to trace the selected implementation directly or avoiding torch custom op overhead in eager mode. Defaults to True when using Inductor with vllm-compile (backend=="inductor" and mode == VLLM_COMPILE), False otherwise.

local_cache_dir = field(default=None, init=False) class-attribute instance-attribute

local cache dir for each rank

max_cudagraph_capture_size = None class-attribute instance-attribute

The maximum cudagraph capture size.

If cudagraph_capture_sizes is specified, this will be set to the largest size in that list (or checked for consistency if specified). If cudagraph_capture_sizes is not specified, the list of sizes is generated automatically following the pattern:

[1, 2, 4] + list(range(8, 256, 8)) + list(
range(256, max_cudagraph_capture_size + 1, 16))

If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2, 512) by default. This avoids OOM in tight memory scenarios with small max_num_seqs, and prevents capture of many large graphs (>512) that would greatly increase startup time with limited performance benefit.

mode = None class-attribute instance-attribute

The compilation approach used for torch.compile-based compilation of the model.

  • None: If None, we will select the default compilation mode. For V1 engine this is 3.
  • 0: NONE: No torch.compile compilation is applied, model runs in fully eager pytorch mode. The model runs as-is.
  • 1: STOCK_TORCH_COMPILE: The standard torch.compile compilation pipeline.
  • 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding recompilation by removing guards. Requires no dynamic-shape-dependent control-flow.
  • 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, piecewise compilation, shape specialization, and custom passes.

pass_config = field(default_factory=PassConfig) class-attribute instance-attribute

Custom inductor passes, see PassConfig for more details

splitting_ops = None class-attribute instance-attribute

A list of ops to exclude from cudagraphs, used in piecewise compilation.

The behavior depends on use_inductor_graph_partition:

  • When use_inductor_graph_partition=False (default): These ops are used for Dynamo FX-level graph splitting. The graph is split at these ops before Inductor compilation, creating separate subgraphs for cudagraph capture.

  • When use_inductor_graph_partition=True: These ops are used to register Inductor partition rules. The graph partitioning happens at Inductor codegen time after all passes and fusions are finished, allowing compilation and custom passes to operate on the full graph while still excluding these ops from cudagraphs.

If None, defaults to attention ops for piecewise cudagraphs. If empty list [], no ops are excluded (suitable for full cudagraphs).

static_all_moe_layers = field(default_factory=list, init=False) class-attribute instance-attribute

The names of all the MOE layers in the model

static_forward_context = field(default_factory=dict, init=False) class-attribute instance-attribute

Per-model forward context. Map from layer name to layer objects that need to be accessed outside model code, e.g., Attention, FusedMOE when dp_size>1.

traced_files = field(default_factory=set, init=False) class-attribute instance-attribute

files that are traced for compilation

use_inductor_graph_partition = None class-attribute instance-attribute

Use inductor graph partition to split the graph at cudagraph_unsafe ops. This partition happens at inductor codegen time after all passes and fusions are finished. It generates a single call function which wraps cudagraph-safe ops into partition functions and leaves cudagraph-unsafe ops outside the partition functions. For a graph with N cudagraph-unsafe ops (e.g., Attention), there would be N+1 partitions. To mark an op as cudagraph unsafe, we can add tags=(torch._C.Tag.cudagraph_unsafe) when registering the custom op.

This config supports both full cudagraph and piecewise cudagraph without compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper to each partition. For N+1 partitions, there would be N+1 CUDAGraph wrapper instances.

For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the inductor call function in the model runner. The top-level full cudagraph capture ignores all partitioning.

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialisation is delayed.

Source code in vllm/config/compilation.py
@field_validator(
    "level",
    "mode",
    "cudagraph_mode",
    "max_cudagraph_capture_size",
    "use_inductor_graph_partition",
    "ir_enable_torch_wrap",
    mode="wrap",
)
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Let `None` through unvalidated; validate every other value normally.

    `None` is returned as-is (initialisation of the field is delayed);
    any other value is handed to the wrapped `handler` and its result is
    returned.
    """
    return value if value is None else handler(value)

compute_hash()

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/compilation.py
def compute_hash(self) -> str:
    """
    Return a hash that uniquely identifies every config option affecting
    the structure of the computation graph between the input
    ids/embeddings and the final hidden states. Anything before the input
    ids/embeddings or after the final hidden states is excluded.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # Opt-out scheme: every declared field is a hash factor unless it is
    # listed here.
    excluded_fields = {
        # Filesystem locations and runtime bookkeeping/metrics; these do not
        # change the compiled graph.
        "debug_dump_path",
        "cache_dir",
        "local_cache_dir",
        "traced_files",
        "compilation_time",
        "encoder_compilation_time",
        "static_forward_context",
        # Nested configs are replaced by their own hash below rather than
        # being hashed as opaque objects.
        "pass_config",
        "dynamic_shapes_config",
    }

    factors = get_hash_factors(self, excluded_fields)

    # Same insertion order as before: pass_config, then dynamic_shapes_config.
    for nested_name in ("pass_config", "dynamic_shapes_config"):
        factors[nested_name] = getattr(self, nested_name).compute_hash()

    return hash_factors(factors)

custom_op_log_check()

This method logs the enabled/disabled custom ops and checks that the passed custom_ops field only contains relevant ops. It is called at the end of set_current_vllm_config, after the custom ops have been instantiated.

Source code in vllm/config/compilation.py
def custom_op_log_check(self):
    """
    Log the enabled/disabled custom ops and warn about every entry of the
    `custom_ops` field that does not correspond to an op in the model.
    Called at the end of `set_current_vllm_config`, once the custom ops
    have been instantiated.
    """
    enabled_ops = self.enabled_custom_ops
    disabled_ops = self.disabled_custom_ops

    if len(enabled_ops) == 0 and len(disabled_ops) == 0:
        logger.debug("No custom ops found in model.")
        return

    logger.debug("enabled custom ops: %s", enabled_ops)
    logger.debug("disabled custom ops: %s", disabled_ops)

    known_ops = enabled_ops | disabled_ops
    for spec in self.custom_ops:
        # The blanket switches carry no op name to verify.
        if spec in {"all", "none"}:
            continue

        sign = spec[0]
        assert sign in {"+", "-"}, (
            "Invalid custom op syntax (should be checked during init)"
        )

        op_name = spec[1:]
        if op_name in known_ops:
            continue

        from vllm.model_executor.custom_op import op_registry

        # Distinguish "unknown op" from "known op that this model does not
        # use". Only op classes that were imported appear in the registry.
        if op_name in op_registry:
            missing_str = "not present in model"
        else:
            missing_str = "doesn't exist (or wasn't imported/registered)"

        enable_str = "enabling" if sign == "+" else "disabling"
        logger.warning_once(
            "Op '%s' %s, %s with '%s' has no effect",
            op_name,
            missing_str,
            enable_str,
            spec,
        )

get_compile_ranges()

Get the compile ranges for the compilation config.

Source code in vllm/config/compilation.py
def get_compile_ranges(self) -> list[Range]:
    """Build the compile ranges from `compile_ranges_endpoints`.

    The endpoints are de-duplicated and sorted. Each endpoint `e` yields
    `Range(start, e)`, where `start` is 1 for the first endpoint and one
    past the previous endpoint afterwards. Returns an empty list when no
    endpoints are configured.
    """
    configured = self.compile_ranges_endpoints
    if configured is None:
        return []

    ranges = []
    start = 1
    for end in sorted(set(configured)):
        ranges.append(Range(start, end))
        start = end + 1
    return ranges

init_backend(vllm_config, prefix='', is_encoder=False)

Initialize the backend for the compilation config from a vllm config. Arguments: vllm_config: The vllm config to initialize the backend from. prefix: Cache directory prefix for this compiled module. is_encoder: Whether this module is used in an encoder (as opposed to a text backbone). Returns: The backend for the compilation config.

Source code in vllm/config/compilation.py
def init_backend(
    self,
    vllm_config: "VllmConfig",
    prefix: str = "",
    is_encoder: bool = False,
) -> str | Callable:
    """
    Initialize the backend for the compilation config from a vllm config.
    Arguments:
        vllm_config: The vllm config to initialize the backend from.
        prefix: Cache directory prefix for this compiled module.
        is_encoder: Whether this module is used in an encoder (as
            opposed to a text backbone).
    Returns:
        The backend for the compilation config.
    """
    if self.mode is None:
        raise ValueError(
            "No compilation mode is set. This method should only be "
            "called via vllm config where the level is set if none is "
            "provided."
        )
    if self.mode == CompilationMode.NONE:
        raise ValueError("No compilation mode is set.")

    from torch._dynamo.backends.registry import list_backends

    torch_backends = list_backends(exclude_tags=tuple())
    if self.mode in [
        CompilationMode.STOCK_TORCH_COMPILE,
        CompilationMode.DYNAMO_TRACE_ONCE,
    ]:
        if self.backend in torch_backends:
            return self.backend
        return resolve_obj_by_qualname(self.backend)

    assert self.mode == CompilationMode.VLLM_COMPILE
    if self.backend not in ["eager", "inductor"]:
        logger.info("Using OOT custom backend for compilation.")

    from vllm.compilation.backends import VllmBackend

    return VllmBackend(vllm_config, prefix=prefix, is_encoder=is_encoder)

post_init_cudagraph_sizes()

To complete the initialization after cudagraph related configs are set. This includes: - initialize compile_sizes

Source code in vllm/config/compilation.py
def post_init_cudagraph_sizes(self) -> None:
    """Complete the initialization after cudagraph related configs are set.

    This includes:
    - initialize compile_sizes: the configured entries are resolved into a
      duplicate-free list of ints, in first-seen order. The special entry
      "cudagraph_capture_sizes" is expanded into the current
      `cudagraph_capture_sizes`.
    - sort `cudagraph_capture_sizes` in ascending order (in place) and check
      that its largest entry equals `max_cudagraph_capture_size`.

    Raises:
        AssertionError: if `compile_sizes` holds a string other than
            "cudagraph_capture_sizes" or a non-int entry, or if the largest
            capture size differs from `max_cudagraph_capture_size`.
    """

    # A dict is used as an insertion-ordered set. De-duplicating *after* the
    # "cudagraph_capture_sizes" expansion keeps a size from appearing twice
    # when it is listed explicitly and is also a capture size, and the result
    # order no longer depends on set iteration order.
    resolved_sizes: dict[int, None] = {}
    if self.compile_sizes is not None:
        # de-duplicate the sizes provided by the config
        for x in dict.fromkeys(self.compile_sizes):
            if isinstance(x, str):
                assert x == "cudagraph_capture_sizes", (
                    "Unrecognized size type in compile_sizes, "
                    f"expect 'cudagraph_capture_sizes', got {x}"
                )
                resolved_sizes.update(dict.fromkeys(self.cudagraph_capture_sizes))
            else:
                assert isinstance(x, int)
                resolved_sizes[x] = None
    self.compile_sizes = list(resolved_sizes)  # type: ignore

    # make sure the sizes are in ascending order
    self.cudagraph_capture_sizes.sort()
    if self.cudagraph_capture_sizes:
        assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size

validate_cudagraph_mode_before(value) classmethod

Enable parsing of the cudagraph_mode enum type from string.

Source code in vllm/config/compilation.py
@field_validator("cudagraph_mode", mode="before")
@classmethod
def validate_cudagraph_mode_before(cls, value: Any) -> Any:
    """Enable parsing of the `cudagraph_mode` enum type from string.

    A string is upper-cased and looked up by name in `CUDAGraphMode`;
    any non-string value is returned unchanged.

    Raises:
        ValueError: if the string is not the name of a `CUDAGraphMode`
            member. A bare `KeyError` from the lookup would escape as a raw
            exception instead of a validation error, so the failure is
            reported as a `ValueError` that lists the valid names.
    """
    if isinstance(value, str):
        mode_name = value.upper()

        if mode_name not in CUDAGraphMode.__members__:
            raise ValueError(
                f"Invalid cudagraph mode: {value}. "
                f"Valid modes are: {', '.join(CUDAGraphMode.__members__.keys())}"
            )

        return CUDAGraphMode[mode_name]
    return value

validate_mode_before(value) classmethod

Enable parsing the mode field from string mode names. Accepts both integers (0-3) and string names, like NONE, STOCK_TORCH_COMPILE, DYNAMO_TRACE_ONCE, VLLM_COMPILE.

Source code in vllm/config/compilation.py
@field_validator("mode", mode="before")
@classmethod
def validate_mode_before(cls, value: Any) -> Any:
    """
    Allow the `mode` field to be given as a mode name.

    String input is matched case-insensitively against the member names of
    `CompilationMode` (NONE, STOCK_TORCH_COMPILE, DYNAMO_TRACE_ONCE,
    VLLM_COMPILE) and converted to that member. Any other input, such as
    the integers 0-3, is returned unchanged.

    Raises:
        ValueError: if the string does not name a `CompilationMode` member.
    """
    if not isinstance(value, str):
        return value

    members = CompilationMode.__members__
    key = value.upper()
    if key in members:
        return members[key]

    raise ValueError(
        f"Invalid compilation mode: {value}. "
        f"Valid modes are: {', '.join(members.keys())}"
    )

validate_pass_config_before(value) classmethod

Enable parsing of the pass_config field from a dictionary.

Source code in vllm/config/compilation.py
@field_validator("pass_config", mode="before")
@classmethod
def validate_pass_config_before(cls, value: Any) -> Any:
    """Allow `pass_config` to be supplied as a plain dictionary.

    A dict is expanded into keyword arguments for `PassConfig`; any other
    value is returned unchanged.
    """
    return PassConfig(**value) if isinstance(value, dict) else value

CompilationMode

Bases: IntEnum

The compilation approach used for torch.compile-based compilation of the model.

Attributes:

  • DYNAMO_TRACE_ONCE

    Single Dynamo trace through the model, avoiding recompilation.

  • NONE

    No torch.compile compilation is applied, model runs in fully eager pytorch mode.

  • STOCK_TORCH_COMPILE

    The standard torch.compile compilation pipeline.

  • VLLM_COMPILE

    Custom vLLM Inductor-based backend with caching, piecewise compilation,

Source code in vllm/config/compilation.py
class CompilationMode(enum.IntEnum):
    """The compilation approach used for torch.compile-based compilation of the
    model.

    Being an `IntEnum`, each member compares equal to its integer value
    (0-3), and members can be looked up by name (e.g.
    `CompilationMode["VLLM_COMPILE"]`).
    """

    # Members are listed in increasing integer value.
    NONE = 0
    """No torch.compile compilation is applied, model runs in fully eager pytorch mode.
    The model runs as-is."""
    STOCK_TORCH_COMPILE = 1
    """The standard `torch.compile` compilation pipeline."""
    DYNAMO_TRACE_ONCE = 2
    """Single Dynamo trace through the model, avoiding recompilation."""
    VLLM_COMPILE = 3
    """Custom vLLM Inductor-based backend with caching, piecewise compilation,
    shape specialization, and custom passes."""

DYNAMO_TRACE_ONCE = 2 class-attribute instance-attribute

Single Dynamo trace through the model, avoiding recompilation.

NONE = 0 class-attribute instance-attribute

No torch.compile compilation is applied, model runs in fully eager pytorch mode. The model runs as-is.

STOCK_TORCH_COMPILE = 1 class-attribute instance-attribute

The standard torch.compile compilation pipeline.

VLLM_COMPILE = 3 class-attribute instance-attribute

Custom vLLM Inductor-based backend with caching, piecewise compilation, shape specialization, and custom passes.

DeviceConfig

Configuration for the device to use for vLLM execution.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config,

Attributes:

  • device (SkipValidation[Device | device | None]) –

    Device type for vLLM execution.

  • device_type (str) –

    Device type from the current platform. This is set in

Source code in vllm/config/device.py
@config(config=ConfigDict(arbitrary_types_allowed=True))
class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

    device: SkipValidation[Device | torch.device | None] = "auto"
    """Device type for vLLM execution.
    This parameter is deprecated and will be
    removed in a future release.
    It will now be set automatically based
    on the current platform."""
    device_type: str = field(init=False)
    """Device type from the current platform. This is set in
    `__post_init__`."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Return a hash that uniquely identifies all the configs that
        affect the structure of the computation graph from input
        ids/embeddings to the final hidden states, excluding anything
        before input ids/embeddings and after the final hidden states.
        """
        # Nothing in this config is a hash factor: the device/platform
        # information is summarized by torch/vllm automatically.
        factors: list[Any] = []
        encoded_factors = str(factors).encode()
        return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

    def __post_init__(self):
        """Derive `device_type`, then normalize `device` from it."""
        from vllm.platforms import current_platform

        requested = self.device
        if requested == "auto":
            # Automated device type detection
            self.device_type = current_platform.device_type
            if not self.device_type:
                raise RuntimeError(
                    "Failed to infer device type, please set "
                    "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
                    "to turn on verbose logging to help debug the issue."
                )
        elif isinstance(requested, str):
            # Device type is assigned explicitly, by name
            self.device_type = requested
        elif isinstance(requested, torch.device):
            # Device type is assigned explicitly, via a torch device
            self.device_type = requested.type

        # Some platforms require processing inputs on CPU.
        handled_on_host = (
            current_platform.uses_host_device_handling()
            and self.device_type == current_platform.device_type
        )
        # Otherwise, set device with device type.
        self.device = None if handled_on_host else torch.device(self.device_type)

device = 'auto' class-attribute instance-attribute

Device type for vLLM execution. This parameter is deprecated and will be removed in a future release. It will now be set automatically based on the current platform.

device_type = field(init=False) class-attribute instance-attribute

Device type from the current platform. This is set in __post_init__.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/device.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Return a hash that uniquely identifies all the configs that
    affect the structure of the computation graph from input
    ids/embeddings to the final hidden states, excluding anything
    before input ids/embeddings and after the final hidden states.
    """
    # Nothing in this config is a hash factor: the device/platform
    # information is summarized by torch/vllm automatically.
    factors: list[Any] = []
    encoded_factors = str(factors).encode()
    return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

DiffusionConfig

Configuration for discrete diffusion language models (dLLMs).

dLLMs generate tokens via iterative denoising over a fixed-length canvas rather than left-to-right autoregressive decoding. They reuse the speculative-decoding data path (draft token ids, scheduled spec decode tokens) with overloaded semantics for block-based generation.

Attributes:

Source code in vllm/config/diffusion.py
@config
class DiffusionConfig:
    """Configuration for discrete diffusion language models (dLLMs).

    dLLMs generate tokens via iterative denoising over a fixed-length canvas
    rather than left-to-right autoregressive decoding. They reuse the
    speculative-decoding data path (draft token ids, scheduled spec decode
    tokens) with overloaded semantics for block-based generation.
    """

    # NOTE(review): annotated as `int` but declared with `default=None`
    # (hence the `type: ignore`), alongside a `gt=0` constraint. Presumably a
    # positive value must be supplied before use — TODO confirm.
    canvas_length: int = Field(default=None, gt=0)  # type: ignore[assignment]
    """Length of the denoising canvas (block).  Also determines the number of
    speculative tokens scheduled per step."""

    # `None` means "not set"; see the attribute docstring for the fallback.
    max_denoising_steps: int | None = None
    """Maximum number of denoising iterations per canvas block.
    If not set, read from the model's generation_config.json."""

canvas_length = Field(default=None, gt=0) class-attribute instance-attribute

Length of the denoising canvas (block). Also determines the number of speculative tokens scheduled per step.

max_denoising_steps = None class-attribute instance-attribute

Maximum number of denoising iterations per canvas block. If not set, read from the model's generation_config.json.

ECTransferConfig

Configuration for distributed EC cache transfer.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config,

Attributes:

  • ec_buffer_device (str | None) –

    The device used by ec connector to buffer the EC cache.

  • ec_buffer_size (float) –

    The buffer size for TorchDistributedConnector. Measured in number of

  • ec_connector (str | None) –

    The EC connector for vLLM to transmit EC caches between vLLM instances.

  • ec_connector_extra_config (dict[str, Any]) –

    any extra config that the connector may need.

  • ec_connector_module_path (str | None) –

    The Python module path to dynamically load the EC connector from.

  • ec_ip (str) –

    The EC connector ip, used to build distributed connection.

  • ec_parallel_size (int) –

    The number of parallel instances for EC cache transfer. For

  • ec_port (int) –

    The EC connector port, used to build distributed connection.

  • ec_rank (int | None) –

    The rank of this vLLM instance in the EC cache transfer. Typical value:

  • ec_role (ECRole | None) –

    Whether this vLLM instance produces, consumes EC cache, or both. Choices

  • engine_id (str | None) –

    The engine id for EC transfers.

Source code in vllm/config/ec_transfer.py
@config
class ECTransferConfig:
    """Configuration for distributed EC cache transfer."""

    ec_connector: str | None = None
    """The EC connector for vLLM to transmit EC caches between vLLM instances.
    """

    engine_id: str | None = None
    """The engine id for EC transfers."""

    ec_buffer_device: str | None = "cuda"
    """The device used by ec connector to buffer the EC cache.
    Currently only support 'cuda'."""

    ec_buffer_size: float = 1e9
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""

    ec_role: ECRole | None = None
    """Whether this vLLM instance produces, consumes EC cache, or both. Choices
    are 'ec_producer', 'ec_consumer', 'ec_both'."""

    ec_rank: int | None = None
    """The rank of this vLLM instance in the EC cache transfer. Typical value:
    0 for encoder, 1 for pd instance.
    Currently only 1P1D is supported."""

    ec_parallel_size: int = 1
    """The number of parallel instances for EC cache transfer. For
    PyNcclConnector, this should be 2."""

    ec_ip: str = "127.0.0.1"
    """The EC connector ip, used to build distributed connection."""

    ec_port: int = 14579
    """The EC connector port, used to build distributed connection."""

    ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""

    ec_connector_module_path: str | None = None
    """The Python module path to dynamically load the EC connector from.
    Only supported in V1."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Return a hash that uniquely identifies all the configs that
        affect the structure of the computation graph from input
        ids/embeddings to the final hidden states, excluding anything
        before input ids/embeddings and after the final hidden states.
        """
        # This config does not affect the computation graph, so the factor
        # list is empty.
        factors: list[Any] = []
        encoded_factors = str(factors).encode()
        return hashlib.md5(encoded_factors, usedforsecurity=False).hexdigest()

    def __post_init__(self) -> None:
        """Assign a default engine id and validate the role settings."""
        supported_roles = get_args(ECRole)

        # Give every instance an id when the caller did not provide one.
        if self.engine_id is None:
            self.engine_id = str(uuid.uuid4())

        role = self.ec_role
        if role is not None and role not in supported_roles:
            raise ValueError(
                f"Unsupported ec_role: {role}. "
                f"Supported roles are {supported_roles}"
            )

        # A connector is meaningless without knowing which side we are.
        if self.ec_connector is not None and role is None:
            raise ValueError(
                "Please specify ec_role when ec_connector "
                f"is set, supported roles are {supported_roles}"
            )

    @property
    def is_ec_transfer_instance(self) -> bool:
        """True when a connector is set and `ec_role` is any `ECRole` value."""
        if self.ec_connector is None:
            return False
        return self.ec_role in get_args(ECRole)

    @property
    def is_ec_producer(self) -> bool:
        """True when a connector is set and `ec_role` is an `ECProducer` value."""
        if self.ec_connector is None:
            return False
        return self.ec_role in get_args(ECProducer)

    @property
    def is_ec_consumer(self) -> bool:
        """True when a connector is set and `ec_role` is an `ECConsumer` value."""
        if self.ec_connector is None:
            return False
        return self.ec_role in get_args(ECConsumer)

    def get_from_extra_config(self, key, default) -> Any:
        """Look up `key` in `ec_connector_extra_config`, or return `default`."""
        extra_config = self.ec_connector_extra_config
        return extra_config.get(key, default)

ec_buffer_device = 'cuda' class-attribute instance-attribute

The device used by ec connector to buffer the EC cache. Currently only support 'cuda'.

ec_buffer_size = 1000000000.0 class-attribute instance-attribute

The buffer size for TorchDistributedConnector. Measured in number of bytes. Recommended value: 1e9 (about 1GB).

ec_connector = None class-attribute instance-attribute

The EC connector for vLLM to transmit EC caches between vLLM instances.

ec_connector_extra_config = field(default_factory=dict) class-attribute instance-attribute

any extra config that the connector may need.

ec_connector_module_path = None class-attribute instance-attribute

The Python module path to dynamically load the EC connector from. Only supported in V1.

ec_ip = '127.0.0.1' class-attribute instance-attribute

The EC connector ip, used to build distributed connection.

ec_parallel_size = 1 class-attribute instance-attribute

The number of parallel instances for EC cache transfer. For PyNcclConnector, this should be 2.

ec_port = 14579 class-attribute instance-attribute

The EC connector port, used to build distributed connection.

ec_rank = None class-attribute instance-attribute

The rank of this vLLM instance in the EC cache transfer. Typical value: 0 for encoder, 1 for pd instance. Currently only 1P1D is supported.

ec_role = None class-attribute instance-attribute

Whether this vLLM instance produces, consumes EC cache, or both. Choices are 'ec_producer', 'ec_consumer', 'ec_both'.

engine_id = None class-attribute instance-attribute

The engine id for EC transfers.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/ec_transfer.py
def compute_hash(self) -> str:
    """
    Return the hash of every setting that shapes the computation graph.

    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    The hash is meant to uniquely identify all the configs that affect
    the structure of the computation graph from input ids/embeddings to
    the final hidden states, excluding anything before input
    ids/embeddings and after the final hidden states.
    """
    # This config contributes nothing to the computation graph, so the
    # factor list is deliberately left empty.
    graph_factors: list[Any] = []
    encoded_factors = str(graph_factors).encode()
    digest = hashlib.md5(encoded_factors, usedforsecurity=False)
    return digest.hexdigest()

EPLBConfig

Configuration for Expert Parallel Load Balancing (EPLB).

Attributes:

Source code in vllm/config/parallel.py
@config
class EPLBConfig:
    """Configuration for Expert Parallel Load Balancing (EPLB).

    Numeric fields carry their bounds in their `Field(...)` declarations;
    option combinations that cannot be used together are rejected by
    `_validate_eplb_config`, which runs as an "after" model validator.
    """

    window_size: int = Field(default=1000, gt=0)
    """Window size for expert load recording."""
    # NOTE(review): `lb_window_size` in the text below presumably refers to
    # the `window_size` field above — confirm.
    step_interval: int = Field(default=3000, gt=0)
    """
    Interval for rearranging experts in expert parallelism.

    Note that if this is greater than the EPLB window size, only the metrics
    of the last `lb_window_size` steps will be used for rearranging experts.
    """

    num_redundant_experts: int = Field(default=0, ge=0)
    """Number of redundant experts to use for expert parallelism."""

    log_balancedness: bool = False
    """
    Log the balancedness each step of expert parallelism.
    This is turned off by default since it will cause communication overhead.
    """
    log_balancedness_interval: int = Field(default=1, gt=0)
    """
    Interval for logging the balancedness.
    """
    use_async: bool = True
    """
    Whether to use non-blocking EPLB.
    """

    policy: EPLBPolicyOption = "default"
    """The policy type for expert parallel load balancing (EPLB)."""

    communicator: EPLBCommunicatorBackend | None = None
    """
    Backend for EPLB expert weight communication:
    - "torch_nccl": Use torch.distributed on the device process group
    - "torch_gloo": Use torch.distributed gloo with CPU staging
    - "nixl": Use NIXL/ RIXL with staged send/recv buffers
    - "pynccl": Use PyNccl send/recv
    - None: Auto-select backend (prefers "nixl", falls back to "torch_gloo")
    """

    @model_validator(mode="after")
    def _validate_eplb_config(self) -> Self:
        """Reject option combinations that are not supported together.

        Returns:
            This config instance, unchanged.

        Raises:
            ValueError: If async EPLB is combined with a non-default policy
                or with the "torch_nccl"/"pynccl" communicator, or if
                balancedness logging is enabled with a non-positive
                interval.
        """
        # Async EPLB is only accepted together with the default policy.
        if self.use_async and self.policy != "default":
            raise ValueError("Async EPLB is only supported with the default policy.")
        # The NCCL-based communicators are rejected when async EPLB is on;
        # an unset communicator (None) passes this check.
        if self.use_async and self.communicator in ("torch_nccl", "pynccl"):
            raise ValueError(
                f"{self.communicator} communicator is incompatible with "
                "async EPLB due to NCCL multi-stream conflicts. Use "
                "'torch_gloo' or 'nixl' instead, or leave communicator "
                "unset for automatic selection."
            )
        # NOTE(review): `log_balancedness_interval` is already declared with
        # `gt=0`, so this looks like a defensive duplicate — confirm.
        if self.log_balancedness and self.log_balancedness_interval <= 0:
            raise ValueError("log_balancedness_interval must be greater than 0.")
        return self

communicator = None class-attribute instance-attribute

Backend for EPLB expert weight communication:

  • "torch_nccl": Use torch.distributed on the device process group
  • "torch_gloo": Use torch.distributed gloo with CPU staging
  • "nixl": Use NIXL/RIXL with staged send/recv buffers
  • "pynccl": Use PyNccl send/recv
  • None: Auto-select backend (prefers "nixl", falls back to "torch_gloo")

log_balancedness = False class-attribute instance-attribute

Log the balancedness each step of expert parallelism. This is turned off by default since it will cause communication overhead.

log_balancedness_interval = Field(default=1, gt=0) class-attribute instance-attribute

Interval for logging the balancedness.

num_redundant_experts = Field(default=0, ge=0) class-attribute instance-attribute

Number of redundant experts to use for expert parallelism.

policy = 'default' class-attribute instance-attribute

The policy type for expert parallel load balancing (EPLB).

step_interval = Field(default=3000, gt=0) class-attribute instance-attribute

Interval for rearranging experts in expert parallelism.

Note that if this is greater than the EPLB window size, only the metrics of the last window_size steps will be used for rearranging experts.

use_async = True class-attribute instance-attribute

Whether to use non-blocking EPLB.

window_size = Field(default=1000, gt=0) class-attribute instance-attribute

Window size for expert load recording.

KVEventsConfig

Configuration for KV event publishing.

Attributes:

  • buffer_steps (int) –

    The number of steps to cache for replay endpoint. Will only save events from the last N steps for the replay endpoint.

  • enable_kv_cache_events (bool) –

    If True, enable KV cache events for tracking block storage and removal.

  • endpoint (str) –

    The zmq endpoint to use for publishing kv events.

  • hwm (int) –

    The zmq high water mark for the event publisher. After queueing N events, events will start dropping if the consumer is not keeping up.

  • max_queue_size (int) –

    The maximum number of events to queue while waiting for publishing.

  • publisher (Literal['null', 'zmq']) –

    The publisher to use for publishing kv events. Can be "null", "zmq".

  • replay_endpoint (str | None) –

    The zmq endpoint to use for replaying kv events.

  • topic (str) –

    The topic to use for the event publisher. Consumers can subscribe to this topic to receive events.

Source code in vllm/config/kv_events.py
@config
class KVEventsConfig:
    """Configuration for KV event publishing.

    `publisher` may be left unset; `__post_init__` then derives it from
    `enable_kv_cache_events`.
    """

    enable_kv_cache_events: bool = False
    """If True, enable KV cache events for tracking block storage and removal.
    Events can be published externally by zmq using the event publisher config.
    """

    # `None` is a placeholder meaning "not chosen"; `__post_init__` replaces
    # it with "zmq" or "null", hence the type-ignore on the default.
    publisher: Literal["null", "zmq"] = None  # type: ignore[assignment]
    """The publisher to use for publishing kv events. Can be "null", "zmq".
    """

    endpoint: str = "tcp://*:5557"
    """The zmq endpoint to use for publishing kv events.
    """

    replay_endpoint: str | None = None
    """The zmq endpoint to use for replaying kv events.
    """

    buffer_steps: int = 10_000
    """The number of steps to cache for replay endpoint. Will only save
    events from the last N steps for the replay endpoint.
    """

    hwm: int = 100_000
    """The zmq high water mark for the event publisher. After queueing N events,
    events will start dropping if the consumer is not keeping up.
    """

    max_queue_size: int = 100_000
    """The maximum number of events to queue while waiting for publishing.
    """

    topic: str = ""
    """The topic to use for the event publisher. Consumers can subscribe to
    this topic to receive events.
    """

    def __post_init__(self) -> None:
        """Fill in `publisher` when it was left unset.

        An unset publisher becomes "zmq" if `enable_kv_cache_events` is
        true and "null" otherwise. An explicitly provided publisher is
        left untouched.
        """
        if self.publisher is None:
            self.publisher = "zmq" if self.enable_kv_cache_events else "null"

buffer_steps = 10000 class-attribute instance-attribute

The number of steps to cache for replay endpoint. Will only save events from the last N steps for the replay endpoint.

enable_kv_cache_events = False class-attribute instance-attribute

If True, enable KV cache events for tracking block storage and removal. Events can be published externally by zmq using the event publisher config.

endpoint = 'tcp://*:5557' class-attribute instance-attribute

The zmq endpoint to use for publishing kv events.

hwm = 100000 class-attribute instance-attribute

The zmq high water mark for the event publisher. After queueing N events, events will start dropping if the consumer is not keeping up.

max_queue_size = 100000 class-attribute instance-attribute

The maximum number of events to queue while waiting for publishing.

publisher = None class-attribute instance-attribute

The publisher to use for publishing kv events. Can be "null", "zmq".

replay_endpoint = None class-attribute instance-attribute

The zmq endpoint to use for replaying kv events.

topic = '' class-attribute instance-attribute

The topic to use for the event publisher. Consumers can subscribe to this topic to receive events.

KVTransferConfig

Configuration for distributed KV cache transfer.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Attributes:

Source code in vllm/config/kv_transfer.py
@config
class KVTransferConfig:
    """Configuration for distributed KV cache transfer.

    `__post_init__` assigns a random `engine_id` when none is given and
    validates `kv_role`. The `is_kv_*` properties report the role of this
    instance and are only true when a connector is configured.
    """

    kv_connector: str | None = None
    """The KV connector for vLLM to transmit KV caches between vLLM instances.
    """

    engine_id: str | None = None
    """The engine id for KV transfers."""

    kv_buffer_device: str = field(default_factory=kv_buffer_device_default_factory)
    """The device used by kv connector to buffer the KV cache. Choices are
    'cuda', 'cpu' and 'xpu'."""

    kv_buffer_size: float = 1e9
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""

    kv_role: KVRole | None = None
    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
    are 'kv_producer', 'kv_consumer', and 'kv_both'."""

    kv_rank: int | None = None
    """The rank of this vLLM instance in the KV cache transfer. Typical value:
    0 for prefill instance, 1 for decode instance.
    Currently only 1P1D is supported."""

    kv_parallel_size: int = 1
    """The number of parallel instances for KV cache transfer."""

    kv_ip: str = "127.0.0.1"
    """The KV connector ip, used to build distributed connection."""

    kv_port: int = 14579
    """The KV connector port, used to build distributed connection."""

    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""

    kv_connector_module_path: str | None = None
    """The Python module path to dynamically load the KV connector from.
    Only supported in V1."""

    enable_permute_local_kv: bool = False
    """Experiment feature flag to enable HND to NHD KV Transfer"""

    kv_load_failure_policy: Literal["recompute", "fail"] = "fail"
    """Policy for handling KV cache load failures.
    'recompute': reschedule the request to recompute failed blocks
    'fail': immediately fail the request with an error finish reason (default)"""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self) -> None:
        """Assign a default engine id and validate the role settings.

        Raises:
            ValueError: If `kv_role` is set to a value outside `KVRole`, or
                if `kv_connector` is set while `kv_role` is left unset.
        """
        # Without an explicit id, each instance gets a fresh random UUID.
        if self.engine_id is None:
            self.engine_id = str(uuid.uuid4())

        if self.kv_role is not None and self.kv_role not in get_args(KVRole):
            raise ValueError(
                f"Unsupported kv_role: {self.kv_role}. "
                f"Supported roles are {get_args(KVRole)}"
            )

        # A connector cannot be used without a role.
        if self.kv_connector is not None and self.kv_role is None:
            raise ValueError(
                "Please specify kv_role when kv_connector "
                f"is set, supported roles are {get_args(KVRole)}"
            )

    @property
    def is_kv_transfer_instance(self) -> bool:
        """True when a connector is set and `kv_role` is any `KVRole` value."""
        return self.kv_connector is not None and self.kv_role in get_args(KVRole)

    @property
    def is_kv_producer(self) -> bool:
        """True when a connector is set and `kv_role` is a `KVProducer` value."""
        return self.kv_connector is not None and self.kv_role in get_args(KVProducer)

    @property
    def is_kv_consumer(self) -> bool:
        """True when a connector is set and `kv_role` is a `KVConsumer` value."""
        return self.kv_connector is not None and self.kv_role in get_args(KVConsumer)

    def get_from_extra_config(self, key: str, default: Any) -> Any:
        """Look up `key` in `kv_connector_extra_config`.

        Args:
            key: Name of the extra config entry.
            default: Value returned when `key` is not present.

        Returns:
            The stored value, or `default` if the key is missing.
        """
        return self.kv_connector_extra_config.get(key, default)

enable_permute_local_kv = False class-attribute instance-attribute

Experimental feature flag to enable HND to NHD KV transfer.

engine_id = None class-attribute instance-attribute

The engine id for KV transfers.

kv_buffer_device = field(default_factory=kv_buffer_device_default_factory) class-attribute instance-attribute

The device used by kv connector to buffer the KV cache. Choices are 'cuda', 'cpu' and 'xpu'.

kv_buffer_size = 1000000000.0 class-attribute instance-attribute

The buffer size for TorchDistributedConnector. Measured in number of bytes. Recommended value: 1e9 (about 1GB).

kv_connector = None class-attribute instance-attribute

The KV connector for vLLM to transmit KV caches between vLLM instances.

kv_connector_extra_config = field(default_factory=dict) class-attribute instance-attribute

Any extra config that the connector may need.

kv_connector_module_path = None class-attribute instance-attribute

The Python module path to dynamically load the KV connector from. Only supported in V1.

kv_ip = '127.0.0.1' class-attribute instance-attribute

The KV connector ip, used to build distributed connection.

kv_load_failure_policy = 'fail' class-attribute instance-attribute

Policy for handling KV cache load failures. 'recompute': reschedule the request to recompute failed blocks 'fail': immediately fail the request with an error finish reason (default)

kv_parallel_size = 1 class-attribute instance-attribute

The number of parallel instances for KV cache transfer.

kv_port = 14579 class-attribute instance-attribute

The KV connector port, used to build distributed connection.

kv_rank = None class-attribute instance-attribute

The rank of this vLLM instance in the KV cache transfer. Typical value: 0 for prefill instance, 1 for decode instance. Currently only 1P1D is supported.

kv_role = None class-attribute instance-attribute

Whether this vLLM instance produces, consumes KV cache, or both. Choices are 'kv_producer', 'kv_consumer', and 'kv_both'.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/kv_transfer.py
def compute_hash(self) -> str:
    """
    Return the hash of every setting that shapes the computation graph.

    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    The hash is meant to uniquely identify all the configs that affect
    the structure of the computation graph from input ids/embeddings to
    the final hidden states, excluding anything before input
    ids/embeddings and after the final hidden states.
    """
    # This config contributes nothing to the computation graph, so the
    # factor list is deliberately left empty.
    graph_factors: list[Any] = []
    encoded_factors = str(graph_factors).encode()
    digest = safe_hash(encoded_factors, usedforsecurity=False)
    return digest.hexdigest()

KernelConfig

Configuration for kernel selection and warmup behavior.

Methods:

Attributes:

Source code in vllm/config/kernel.py
@config
class KernelConfig:
    """Configuration for kernel selection and warmup behavior.

    Backend names given as strings are normalized (lower-cased, hyphens
    replaced by underscores) before validation. `set_platform_defaults`
    merges the current platform's IR op priorities into `ir_op_priority`.
    """

    ir_op_priority: IrOpPriorityConfig = Field(default_factory=IrOpPriorityConfig)
    """
    vLLM IR op priority for dispatching/lowering during the forward pass.
    Platform defaults appended automatically during VllmConfig.__post_init__.
    """

    # `None` is accepted as an "unset" placeholder: `_skip_none_validation`
    # lets it bypass field validation. Where it is later resolved to a bool
    # is not visible in this class.
    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
    """If True, run FlashInfer autotuning during kernel warmup."""

    moe_backend: MoEBackend = "auto"
    """Backend for MoE expert computation kernels. Available options:

    - "auto": Automatically select the best backend based on model and hardware
    - "triton": Use Triton-based fused MoE kernels
    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
    - "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
    - "cutlass": Use vLLM CUTLASS kernels
    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
    - "flashinfer_b12x": Use FlashInfer CuteDSL fused MoE for SM12x
      (RTX Pro 6000 / DGX Spark)
    - "marlin": Use Marlin kernels (weight-only quantization)
    - "humming": Use Humming Mixed Precision kernels
    - "triton_unfused": Use Triton unfused MoE kernels
    - "aiter": Use AMD AITer kernels (ROCm only)
    - "flydsl": Use AMD FlyDSL kernels (ROCm only)
    - "hpc": Use HPC kernels (FP8 and Hopper only)
    - "emulation": use BF16/FP16 GEMM, dequantizing weights and
                   running QDQ on activations.
    """

    linear_backend: LinearBackend = "auto"
    """Backend for quantized linear layer GEMM kernels. Available options:

    - "auto": Automatically select the best backend based on model and hardware
    - "cutlass": Use CUTLASS-based kernels
    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
    - "flashinfer_cutedsl": Use FlashInfer with CuTe-DSL kernels (NVFP4, MXFP8)
    - "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels
    - "flashinfer_cudnn": Use FlashInfer with cuDNN kernels
    - "flashinfer_b12x": Use FlashInfer b12x CuteDSL NVFP4 GEMM (SM120+)
    - "marlin": Use Marlin kernels
    - "triton": Use Triton-based kernels
    - "deep_gemm": Use DeepGEMM kernels
    - "torch": Use PyTorch native scaled_mm kernels
    - "aiter": Use AMD AITer kernels (ROCm only)
    - "machete": Use Machete kernels (mixed-precision)
    - "fbgemm": Use FBGEMM kernels
    - "conch": Use Conch mixed-precision kernels
    - "exllama": Use Exllama mixed-precision kernels
    - "emulation": Use slow dequant-to-BF16 emulation (for testing only)"""

    @field_validator("moe_backend", mode="before")
    @classmethod
    def _normalize_moe_backend(cls, value: Any) -> Any:
        """Lower-case a string `moe_backend` and turn "-" into "_".

        Runs before field validation; non-string values pass through
        unchanged.
        """
        if isinstance(value, str):
            return value.lower().replace("-", "_")
        return value

    @field_validator("linear_backend", mode="before")
    @classmethod
    def _normalize_linear_backend(cls, value: Any) -> Any:
        """Lower-case a string `linear_backend` and turn "-" into "_".

        Runs before field validation; non-string values pass through
        unchanged.
        """
        if isinstance(value, str):
            return value.lower().replace("-", "_")
        return value

    def compute_hash(self) -> str:
        """
        Produces a hash unique to the pass configuration.
        Any new fields that affect compilation should be added to the hash.
        Any future fields that don't affect compilation should be excluded.
        """
        ignored_factors = {
            "enable_flashinfer_autotune",
            "ir_op_priority",  # handled separately below
        }
        factors = get_hash_factors(self, ignored_factors)
        # The nested priority config contributes through its own hash.
        factors["ir_op_priority"] = self.ir_op_priority.compute_hash()
        return hash_factors(factors)

    @field_validator("enable_flashinfer_autotune", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialization is delayed."""
        if value is None:
            return value
        # Any other value goes through the regular validation handler.
        return handler(value)

    def set_platform_defaults(self, vllm_config: "VllmConfig") -> None:
        """Set platform-specific defaults for the kernel config.

        For every op reported by the current platform, an unset priority
        list is replaced by the platform's list; an existing list gets the
        platform entries it does not already contain appended at the end.

        Args:
            vllm_config: Passed to the platform when asking for its default
                IR op priorities.
        """
        # Imported here rather than at module level; presumably to avoid an
        # import cycle or early platform detection — TODO confirm.
        from vllm.platforms import current_platform

        platform_op_priority = current_platform.get_default_ir_op_priority(vllm_config)
        logger.debug(
            "Setting platform-specific IR op priority defaults: %s, user-defined: %s",
            platform_op_priority,
            self.ir_op_priority,
        )
        for op_name, op_priority in asdict(platform_op_priority).items():
            # May be None when nothing has been set for this op (see the
            # check below), despite the list annotation.
            current_op_priority: list[str] = getattr(self.ir_op_priority, op_name)
            if current_op_priority is None:
                setattr(self.ir_op_priority, op_name, op_priority)
            else:
                # Append platform-specific priorities
                # Must be idempotent because vllm_config.set_platform_defaults() may be
                # called multiple times (due to VllmConfig.__post_init__ manual call).
                unique_op_priority = [
                    op for op in op_priority if op not in current_op_priority
                ]
                # In-place extend, so the list held by ir_op_priority is updated.
                current_op_priority.extend(unique_op_priority)

        logger.info(
            "Final IR op priority after setting platform defaults: %s",
            self.ir_op_priority,
        )

enable_flashinfer_autotune = None class-attribute instance-attribute

If True, run FlashInfer autotuning during kernel warmup.

ir_op_priority = Field(default_factory=IrOpPriorityConfig) class-attribute instance-attribute

vLLM IR op priority for dispatching/lowering during the forward pass. Platform defaults are appended automatically during `VllmConfig.__post_init__`.

linear_backend = 'auto' class-attribute instance-attribute

Backend for quantized linear layer GEMM kernels. Available options:

  • "auto": Automatically select the best backend based on model and hardware
  • "cutlass": Use CUTLASS-based kernels
  • "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
  • "flashinfer_cutedsl": Use FlashInfer with CuTe-DSL kernels (NVFP4, MXFP8)
  • "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels
  • "flashinfer_cudnn": Use FlashInfer with cuDNN kernels
  • "flashinfer_b12x": Use FlashInfer b12x CuteDSL NVFP4 GEMM (SM120+)
  • "marlin": Use Marlin kernels
  • "triton": Use Triton-based kernels
  • "deep_gemm": Use DeepGEMM kernels
  • "torch": Use PyTorch native scaled_mm kernels
  • "aiter": Use AMD AITer kernels (ROCm only)
  • "machete": Use Machete kernels (mixed-precision)
  • "fbgemm": Use FBGEMM kernels
  • "conch": Use Conch mixed-precision kernels
  • "exllama": Use Exllama mixed-precision kernels
  • "emulation": Use slow dequant-to-BF16 emulation (for testing only)

moe_backend = 'auto' class-attribute instance-attribute

Backend for MoE expert computation kernels. Available options:

  • "auto": Automatically select the best backend based on model and hardware
  • "triton": Use Triton-based fused MoE kernels
  • "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
  • "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
  • "cutlass": Use vLLM CUTLASS kernels
  • "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
  • "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
  • "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
  • "flashinfer_b12x": Use FlashInfer CuteDSL fused MoE for SM12x (RTX Pro 6000 / DGX Spark)
  • "marlin": Use Marlin kernels (weight-only quantization)
  • "humming": Use Humming Mixed Precision kernels
  • "triton_unfused": Use Triton unfused MoE kernels
  • "aiter": Use AMD AITer kernels (ROCm only)
  • "flydsl": Use AMD FlyDSL kernels (ROCm only)
  • "hpc": Use HPC kernels (FP8 and Hopper only)
  • "emulation": use BF16/FP16 GEMM, dequantizing weights and running QDQ on activations.

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialization is delayed.

Source code in vllm/config/kernel.py
@field_validator("enable_flashinfer_autotune", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Let `None` through untouched; validate every other value normally.

    `None` stands for a value whose initialization is delayed, so it is
    returned as-is instead of being handed to the wrapped validator.
    """
    if value is not None:
        return handler(value)
    return value

compute_hash()

Produces a hash unique to the pass configuration. Any new fields that affect compilation should be added to the hash. Any future fields that don't affect compilation should be excluded.

Source code in vllm/config/kernel.py
def compute_hash(self) -> str:
    """
    Hash the parts of this configuration that are relevant to compilation.

    The result is unique to the pass configuration. Fields that affect
    compilation should be part of the hash; fields that do not affect
    compilation should be excluded.
    """
    # `ir_op_priority` is skipped by the generic factor collection and
    # re-added just below through its own hash.
    skipped_fields = {"ir_op_priority", "enable_flashinfer_autotune"}
    hash_inputs = get_hash_factors(self, skipped_fields)
    hash_inputs["ir_op_priority"] = self.ir_op_priority.compute_hash()
    return hash_factors(hash_inputs)

set_platform_defaults(vllm_config)

Set platform-specific defaults for the kernel config.

Source code in vllm/config/kernel.py
def set_platform_defaults(self, vllm_config: "VllmConfig") -> None:
    """Merge the current platform's default IR op priorities into this config.

    For each op reported by the platform: when no priority list is set on
    this config, the platform's list is stored as-is; otherwise the platform
    entries that are not yet present are appended after the existing ones.
    """
    from vllm.platforms import current_platform

    platform_defaults = current_platform.get_default_ir_op_priority(vllm_config)
    logger.debug(
        "Setting platform-specific IR op priority defaults: %s, user-defined: %s",
        platform_defaults,
        self.ir_op_priority,
    )
    for op_name, default_priority in asdict(platform_defaults).items():
        existing: list[str] = getattr(self.ir_op_priority, op_name)
        if existing is not None:
            # Only entries that are not already listed get appended, which
            # keeps repeated calls harmless: vllm_config.set_platform_defaults()
            # may run more than once (VllmConfig.__post_init__ calls it manually).
            missing = [op for op in default_priority if op not in existing]
            existing.extend(missing)
            continue
        # Nothing configured for this op: adopt the platform list directly.
        setattr(self.ir_op_priority, op_name, default_priority)

    logger.info(
        "Final IR op priority after setting platform defaults: %s",
        self.ir_op_priority,
    )

LoRAConfig

Configuration for LoRA.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Attributes:

Source code in vllm/config/lora.py
@config(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
    """Configuration for LoRA.

    `_validate_lora_config` fills in `max_cpu_loras` and checks platform
    constraints after construction; `verify_with_model_config` resolves
    `lora_dtype` against a model config.
    """

    max_lora_rank: MaxLoRARanks = 16
    """Max LoRA rank."""
    max_loras: int = Field(default=1, ge=1)
    """Max number of LoRAs in a single batch."""
    fully_sharded_loras: bool = False
    """By default, only half of the LoRA computation is sharded with tensor
    parallelism. Enabling this will use the fully sharded layers. At high
    sequence length, max rank or tensor parallel size, this is likely faster.
    """
    # Left as None, this is set to `max_loras` by `_validate_lora_config`.
    max_cpu_loras: int | None = None
    """Maximum number of LoRAs to store in CPU memory. Must be >= than
    `max_loras`."""
    lora_dtype: torch.dtype | LoRADType = "auto"
    """Data type for LoRA. If auto, will default to base model dtype."""
    target_modules: list[str] | None = None
    """Restrict LoRA to specific module suffixes (e.g., ["o_proj", "qkv_proj"]).
    If None, all supported LoRA modules are used. This allows deployment-time
    control over which modules have LoRA applied, useful for performance tuning."""
    default_mm_loras: dict[str, str] | None = None
    """Dictionary mapping specific modalities to LoRA model paths; this field
    is only applicable to multimodal models and should be leveraged when a
    model always expects a LoRA to be active when a given modality is present.
    Note that currently, if a request provides multiple additional
    modalities, each of which have their own LoRA, we do NOT apply
    default_mm_loras because we currently only support one lora adapter
    per prompt. When run in offline mode, the lora IDs for n modalities
    will be automatically assigned to 1-n with the names of the modalities
    in alphabetic order."""
    enable_tower_connector_lora: bool = False
    """If `True`, LoRA support for the tower (vision encoder) and connector 
    of multimodal models will be enabled. This is an experimental feature and 
    currently only supports some MM models such as the Qwen VL series. The default 
    is False."""
    specialize_active_lora: bool = False
    """Whether to construct lora kernel grid by the number of active LoRA adapters.
    When set to True, separate cuda graphs will be captured for different counts
    of active LoRAs (powers of 2 up to max_loras), which can improve performance
    for variable LoRA usage patterns at the cost of increased startup time and
    memory usage. Only takes effect when cudagraph_specialize_lora is True.
    """
    enable_mixed_moe_lora_format: bool = False
    """If True, force the engine to use the universal 2D MoE LoRA wrapper
    (`FusedMoEWithLoRA`) regardless of the model's `is_3d_moe_weight` flag, so
    that 2D-format and 3D-format MoE LoRA adapters can be served in the same
    deployment. Only meaningful forMoE models; ignored otherwise. Default False 
    keeps the existing model-driven behavior."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # NOTE(review): max_cpu_loras, default_mm_loras and
        # specialize_active_lora are not part of the factors — confirm that
        # none of them affects the computation graph.
        factors: list[Any] = []
        factors.append(self.max_lora_rank)
        factors.append(self.max_loras)
        factors.append(self.fully_sharded_loras)
        factors.append(self.lora_dtype)
        factors.append(self.enable_tower_connector_lora)
        factors.append(self.enable_mixed_moe_lora_format)
        # target_modules affects which modules get LoRA applied
        # Sorted so the hash does not depend on list order; an empty list is
        # falsy and therefore contributes None, the same as an unset value.
        factors.append(
            tuple(sorted(self.target_modules)) if self.target_modules else None
        )

        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @model_validator(mode="after")
    def _validate_lora_config(self) -> Self:
        """Fill in derived defaults and check cross-field constraints.

        Returns:
            This config instance, with `max_cpu_loras` set.

        Raises:
            ValueError: If `max_cpu_loras` is smaller than `max_loras`, or if
                `VLLM_LORA_ENABLE_DUAL_STREAM` is enabled on a platform that
                is not CUDA-alike.
        """
        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
        elif self.max_cpu_loras < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})."
            )
        if envs.VLLM_LORA_ENABLE_DUAL_STREAM and not current_platform.is_cuda_alike():
            raise ValueError("Dual CUDA streams are only supported on CUDA platforms.")
        if envs.VLLM_LORA_ENABLE_DUAL_STREAM and self.fully_sharded_loras:
            logger.warning_once(
                "fully_sharded_loras isn't compatible with "
                "VLLM_LORA_ENABLE_DUAL_STREAM, set VLLM_LORA_ENABLE_DUAL_STREAM=False"
            )
            # Side effect outside this instance: the setting is switched off
            # on the shared `envs` module.
            envs.VLLM_LORA_ENABLE_DUAL_STREAM = False
        return self

    def verify_with_model_config(self, model_config: ModelConfig) -> None:
        """Resolve `lora_dtype` in place using the model config.

        `None` or "auto" becomes `model_config.dtype`; any other string is
        looked up as an attribute of `torch`. Values that are not strings
        are left unchanged.

        Args:
            model_config: Model configuration supplying the fallback dtype.
        """
        if self.lora_dtype in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            self.lora_dtype = getattr(torch, self.lora_dtype)

default_mm_loras = None class-attribute instance-attribute

Dictionary mapping specific modalities to LoRA model paths; this field is only applicable to multimodal models and should be leveraged when a model always expects a LoRA to be active when a given modality is present. Note that currently, if a request provides multiple additional modalities, each of which have their own LoRA, we do NOT apply default_mm_loras because we currently only support one lora adapter per prompt. When run in offline mode, the lora IDs for n modalities will be automatically assigned to 1-n with the names of the modalities in alphabetic order.

enable_mixed_moe_lora_format = False class-attribute instance-attribute

If True, force the engine to use the universal 2D MoE LoRA wrapper (FusedMoEWithLoRA) regardless of the model's is_3d_moe_weight flag, so that 2D-format and 3D-format MoE LoRA adapters can be served in the same deployment. Only meaningful for MoE models; ignored otherwise. The default (False) keeps the existing model-driven behavior.

enable_tower_connector_lora = False class-attribute instance-attribute

If True, LoRA support for the tower (vision encoder) and connector of multimodal models will be enabled. This is an experimental feature and currently only supports some MM models such as the Qwen VL series. The default is False.

fully_sharded_loras = False class-attribute instance-attribute

By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster.

lora_dtype = 'auto' class-attribute instance-attribute

Data type for LoRA. If auto, will default to base model dtype.

max_cpu_loras = None class-attribute instance-attribute

Maximum number of LoRAs to store in CPU memory. Must be >= max_loras.

max_lora_rank = 16 class-attribute instance-attribute

Max LoRA rank.

max_loras = Field(default=1, ge=1) class-attribute instance-attribute

Max number of LoRAs in a single batch.

specialize_active_lora = False class-attribute instance-attribute

Whether to construct lora kernel grid by the number of active LoRA adapters. When set to True, separate cuda graphs will be captured for different counts of active LoRAs (powers of 2 up to max_loras), which can improve performance for variable LoRA usage patterns at the cost of increased startup time and memory usage. Only takes effect when cudagraph_specialize_lora is True.

target_modules = None class-attribute instance-attribute

Restrict LoRA to specific module suffixes (e.g., ["o_proj", "qkv_proj"]). If None, all supported LoRA modules are used. This allows deployment-time control over which modules have LoRA applied, useful for performance tuning.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/lora.py
def compute_hash(self) -> str:
    """
    Hash the LoRA settings that shape the computation graph.

    WARNING: Whenever a new field is added to this config, add it to
    the factor list below if it affects the computation graph.

    The hash should uniquely identify every setting that changes the
    structure of the computation graph between the input
    ids/embeddings and the final hidden states; anything before the
    input ids/embeddings or after the final hidden states is excluded.
    """
    graph_factors: list[Any] = [
        self.max_lora_rank,
        self.max_loras,
        self.fully_sharded_loras,
        self.lora_dtype,
        self.enable_tower_connector_lora,
        self.enable_mixed_moe_lora_format,
        # target_modules decides which modules get LoRA applied. Sorting
        # makes the factor independent of the order the suffixes were
        # listed in; an unset or empty restriction is recorded as None.
        tuple(sorted(self.target_modules)) if self.target_modules else None,
    ]

    digest = safe_hash(str(graph_factors).encode(), usedforsecurity=False)
    return digest.hexdigest()

LoadConfig

Configuration for loading the model weights.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Attributes:

Source code in vllm/config/load.py
@config
class LoadConfig:
    """Configuration for loading the model weights."""

    load_format: str | LoadFormats = "auto"
    """
    The format of the model weights to load.

    - "auto" will try to load the weights in the safetensors format and fall
      back to the pytorch bin format if safetensors format is not available.
    - "pt" will load the weights in the pytorch bin format.
    - "safetensors" will load the weights in the safetensors format.
    - "instanttensor" will load the Safetensors weights on CUDA devices using
      InstantTensor, which enables distributed loading with pipelined prefetching
      and fast direct I/O.
    - "npcache" will load the weights in pytorch format and store a numpy cache
      to speed up the loading.
    - "dummy" will initialize the weights with random values, which is mainly
      for profiling.
    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
      loading. See the Tensorize vLLM Model script in the Examples section for
      more information.
    - "runai_streamer" will load the Safetensors weights using Run:ai Model
      Streamer.
    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
      files using Run:ai Model Streamer.
    - "bitsandbytes" will load the weights using bitsandbytes quantization.
    - "sharded_state" will load weights from pre-sharded checkpoint files,
      supporting efficient loading of tensor-parallel models.
    - "mistral" will load weights from consolidated safetensors files used by
      Mistral models.
    - "modelexpress" will load weights using ModelExpress.
    - Other custom values can be supported via plugins.
    """
    download_dir: str | None = None
    """Directory to download and load the weights, default to the default
    cache directory of Hugging Face."""
    safetensors_load_strategy: SafetensorsLoadStrategy | None = None
    """
    Specifies the loading strategy for safetensors weights.

    - None (default): Uses memory-mapped (lazy) loading. When an NFS
      filesystem is detected and the total checkpoint size fits within 90%%
      of available RAM, prefetching is enabled automatically.
    - "lazy": Weights are memory-mapped from the file. This enables
      on-demand loading and is highly efficient for models on local storage.
      Unlike the default (None), auto-prefetch on NFS is not performed.
    - "eager": The entire file is read into CPU memory upfront before loading.
      This is recommended for models on network filesystems (e.g., Lustre, NFS)
      as it avoids inefficient random reads, significantly speeding up model
      initialization. However, it uses more CPU RAM.
    - "prefetch": Checkpoint files are read into the OS page cache before
      workers load them, speeding up the model loading phase. Useful on
      network or high-latency storage.
    - "torchao": Weights are loaded in upfront and then reconstructed
      into torchao tensor subclasses. This is used when the checkpoint
      was quantized using torchao and saved using safetensors.
      Needs `torchao >= 0.14.0`.
    """
    safetensors_prefetch_num_threads: int = Field(
        default=DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS, ge=1
    )
    """Number of worker threads used to prefetch safetensors checkpoint files
    into the OS page cache when safetensors prefetching is enabled."""
    safetensors_prefetch_block_size: int = Field(
        default=DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE, ge=1
    )
    """Read size in bytes for each safetensors checkpoint file prefetch."""
    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
    """Extra config for model loader. This will be passed to the model loader
    corresponding to the chosen load_format."""
    device: str | None = None
    """Device to which model weights will be loaded, default to
    device_config.device"""
    ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
    """The list of patterns to ignore when loading the model. Default to
    "original/**/*" to avoid repeated loading of llama's checkpoints."""
    use_tqdm_on_load: bool = True
    """Whether to enable tqdm for showing progress bar when loading model
    weights."""
    pt_load_map_location: str | dict[str, str] = "cpu"
    """
    The map location for loading pytorch checkpoint, to support loading
    checkpoints can only be loaded on certain devices like "cuda", this
    is equivalent to `{"": "cuda"}`. Another supported format is mapping
    from different devices like from GPU 1 to GPU 0: `{"cuda:1": "cuda:0"}`.
    Note that when passed from command line, the strings in dictionary
    need to be double quoted for json parsing. For more details, see
    the original doc for `map_location` parameter in [`torch.load`][] parameter.
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    # The validators below are declared as explicit classmethods, matching
    # the other field validators in this package (e.g. MambaConfig) and the
    # pattern shown in pydantic's documentation, instead of relying on the
    # decorator to infer it from the `cls` parameter.
    @field_validator("load_format", mode="after")
    @classmethod
    def _lowercase_load_format(cls, load_format: str) -> str:
        """Return `load_format` converted to lower case."""
        return load_format.lower()

    @field_validator("ignore_patterns", mode="after")
    @classmethod
    def _validate_ignore_patterns(
        cls, ignore_patterns: list[str] | str
    ) -> list[str] | str:
        """Log non-default ignore patterns and return them unchanged.

        Nothing is logged when the value equals the default
        `["original/**/*"]` or is empty.
        """
        if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
            logger.info(
                "Ignoring the following patterns when downloading weights: %s",
                ignore_patterns,
            )

        return ignore_patterns

device = None class-attribute instance-attribute

Device to which model weights will be loaded, default to device_config.device

download_dir = None class-attribute instance-attribute

Directory to download and load the weights, default to the default cache directory of Hugging Face.

ignore_patterns = Field(default_factory=(lambda: ['original/**/*'])) class-attribute instance-attribute

The list of patterns to ignore when loading the model. Defaults to "original/**/*" to avoid repeated loading of llama's checkpoints.

load_format = 'auto' class-attribute instance-attribute

The format of the model weights to load.

  • "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
  • "pt" will load the weights in the pytorch bin format.
  • "safetensors" will load the weights in the safetensors format.
  • "instanttensor" will load the Safetensors weights on CUDA devices using InstantTensor, which enables distributed loading with pipelined prefetching and fast direct I/O.
  • "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
  • "dummy" will initialize the weights with random values, which is mainly for profiling.
  • "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. See the Tensorize vLLM Model script in the Examples section for more information.
  • "runai_streamer" will load the Safetensors weights using Run:ai Model Streamer.
  • "runai_streamer_sharded" will load weights from pre-sharded checkpoint files using Run:ai Model Streamer.
  • "bitsandbytes" will load the weights using bitsandbytes quantization.
  • "sharded_state" will load weights from pre-sharded checkpoint files, supporting efficient loading of tensor-parallel models.
  • "mistral" will load weights from consolidated safetensors files used by Mistral models.
  • "modelexpress" will load weights using ModelExpress.
  • Other custom values can be supported via plugins.

model_loader_extra_config = Field(default_factory=dict) class-attribute instance-attribute

Extra config for model loader. This will be passed to the model loader corresponding to the chosen load_format.

pt_load_map_location = 'cpu' class-attribute instance-attribute

The map location for loading PyTorch checkpoints. This supports checkpoints that can only be loaded on certain devices: a device string such as "cuda" is equivalent to {"": "cuda"}. Another supported format is a mapping between devices, for example from GPU 1 to GPU 0: {"cuda:1": "cuda:0"}. Note that when passed from the command line, the strings in the dictionary need to be double-quoted for JSON parsing. For more details, see the documentation of the map_location parameter of torch.load.

safetensors_load_strategy = None class-attribute instance-attribute

Specifies the loading strategy for safetensors weights.

  • None (default): Uses memory-mapped (lazy) loading. When an NFS filesystem is detected and the total checkpoint size fits within 90% of available RAM, prefetching is enabled automatically.
  • "lazy": Weights are memory-mapped from the file. This enables on-demand loading and is highly efficient for models on local storage. Unlike the default (None), auto-prefetch on NFS is not performed.
  • "eager": The entire file is read into CPU memory upfront before loading. This is recommended for models on network filesystems (e.g., Lustre, NFS) as it avoids inefficient random reads, significantly speeding up model initialization. However, it uses more CPU RAM.
  • "prefetch": Checkpoint files are read into the OS page cache before workers load them, speeding up the model loading phase. Useful on network or high-latency storage.
  • "torchao": Weights are loaded in upfront and then reconstructed into torchao tensor subclasses. This is used when the checkpoint was quantized using torchao and saved using safetensors. Needs torchao >= 0.14.0.

safetensors_prefetch_block_size = Field(default=DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE, ge=1) class-attribute instance-attribute

Read size in bytes for each safetensors checkpoint file prefetch.

safetensors_prefetch_num_threads = Field(default=DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS, ge=1) class-attribute instance-attribute

Number of worker threads used to prefetch safetensors checkpoint files into the OS page cache when safetensors prefetching is enabled.

use_tqdm_on_load = True class-attribute instance-attribute

Whether to enable tqdm for showing progress bar when loading model weights.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/load.py
def compute_hash(self) -> str:
    """
    Hash the load settings that shape the computation graph.

    WARNING: Whenever a new field is added to this config, add it to
    the factor list below if it affects the computation graph.

    The hash should uniquely identify every setting that changes the
    structure of the computation graph between the input
    ids/embeddings and the final hidden states; anything before the
    input ids/embeddings or after the final hidden states is excluded.
    """
    # This config does not affect the computation graph, so the factor
    # list is deliberately empty.
    graph_factors: list[Any] = []
    digest = safe_hash(str(graph_factors).encode(), usedforsecurity=False)
    return digest.hexdigest()

MambaConfig

Configuration for Mamba SSM backends.

Methods:

Attributes:

Source code in vllm/config/mamba.py
@config
class MambaConfig:
    """Configuration for Mamba SSM backends."""

    backend: MambaBackendEnum = MambaBackendEnum.TRITON
    """Mamba SSU backend to use."""

    enable_stochastic_rounding: bool = False
    """Enable stochastic rounding when writing SSM state to fp16 cache.
    Uses random bits to unbias the rounding error, which can improve
    numerical stability for long sequences."""
    stochastic_rounding_philox_rounds: int = 0
    """Number of Philox PRNG rounds for stochastic rounding random number
    generation. 0 uses the Triton default. Higher values improve randomness
    quality at the cost of compute."""

    @field_validator("backend", mode="before")
    @classmethod
    def validate_backend_before(cls, value: Any) -> Any:
        """Allow `backend` to be given as a string naming an enum member.

        Strings are upper-cased and looked up by member name in
        `MambaBackendEnum`; any other value is passed through unchanged.
        """
        return MambaBackendEnum[value.upper()] if isinstance(value, str) else value

    def __post_init__(self):
        """Reject stochastic rounding on platforms that cannot support it.

        Raises:
            ValueError: if stochastic rounding is enabled on a non-CUDA
                platform, or with the Triton backend on a device outside
                the capability family 100.
        """
        # Nothing to check unless stochastic rounding was requested.
        if not self.enable_stochastic_rounding:
            return

        # Imported here, and only when the check is actually needed.
        from vllm.platforms import current_platform

        if not current_platform.is_cuda():
            raise ValueError(
                "Stochastic rounding for Mamba cache is only supported "
                "on NVIDIA CUDA platforms. Please do not specify  "
                "`--enable-mamba-cache-stochastic-rounding`."
            )

        # The capability query is only made for the Triton backend.
        uses_triton = self.backend == MambaBackendEnum.TRITON
        if uses_triton and not current_platform.is_device_capability_family(100):
            raise ValueError(
                "Stochastic rounding for Mamba cache with triton backend requires "
                "compute capability 10.0 (data center Blackwell). The `cvt.rs` "
                "PTX instruction is not supported on your GPU. Please do not "
                "specify `--enable-mamba-cache-stochastic-rounding`, "
                "or set `--mamba-backend flashinfer`."
            )

backend = MambaBackendEnum.TRITON class-attribute instance-attribute

Mamba SSU backend to use.

enable_stochastic_rounding = False class-attribute instance-attribute

Enable stochastic rounding when writing SSM state to fp16 cache. Uses random bits to unbias the rounding error, which can improve numerical stability for long sequences.

stochastic_rounding_philox_rounds = 0 class-attribute instance-attribute

Number of Philox PRNG rounds for stochastic rounding random number generation. 0 uses the Triton default. Higher values improve randomness quality at the cost of compute.

validate_backend_before(value) classmethod

Enable parsing of the backend enum type from string.

Source code in vllm/config/mamba.py
@field_validator("backend", mode="before")
@classmethod
def validate_backend_before(cls, value: Any) -> Any:
    """Allow `backend` to be given as a string naming an enum member.

    Strings are upper-cased and looked up by member name in
    `MambaBackendEnum`; any other value is passed through unchanged.
    """
    if not isinstance(value, str):
        return value
    return MambaBackendEnum[value.upper()]

ModelConfig

Configuration for the model.

Methods:

Attributes:

Source code in vllm/config/model.py
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
@config(config=ConfigDict(arbitrary_types_allowed=True))
class ModelConfig:
    """Configuration for the model."""

    model: str = "Qwen/Qwen3-0.6B"
    """Name or path of the Hugging Face model to use. It is also used as the
    content for `model_name` tag in metrics output when `served_model_name` is
    not specified."""
    model_weights: str = ""
    """Original model weights path. Used when the model is pulled from object
    storage (e.g., RunAI) to preserve the original URI while `model` points to
    the local directory."""
    runner: RunnerOption = "auto"
    """The type of model runner to use. Each vLLM instance only supports one
    model runner, even if the same model can be used for multiple types."""
    convert: ConvertOption = "auto"
    """Convert the model using adapters defined in
    [vllm.model_executor.models.adapters][]. The most common use case is to
    adapt a text generation model to be used for pooling tasks."""
    tokenizer: str = None  # type: ignore[assignment]
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
    tokenizer_mode: TokenizerMode | str = "auto"
    """Tokenizer mode:

    - "auto" will use the tokenizer from `mistral_common` for Mistral models
      if available, otherwise it will use the "hf" tokenizer.
    - "hf" will use the fast tokenizer if available.
    - "slow" will always use the slow tokenizer.
    - "mistral" will always use the tokenizer from `mistral_common`.
    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
    - "deepseek_v4" will always use the tokenizer from `deepseek_v4`.
    - Other custom values can be supported via plugins.

    To swap the Rust BPE backend that powers HF fast tokenizers for the
    [fastokens](https://github.com/crusoecloud/fastokens) implementation, set
    `VLLM_USE_FASTOKENS=1` instead — that override applies to any mode that
    loads an HF fast tokenizer (`hf`, `deepseek_v32`, `deepseek_v4`, …)."""
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
    and tokenizer."""
    dtype: ModelDType | torch.dtype = "auto"
    """Data type for model weights and activations:

    - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
      precision for BF16 models.
    - "half" for FP16. Recommended for AWQ quantization.
    - "float16" is the same as "half".
    - "bfloat16" for a balance between precision and range.
    - "float" is shorthand for FP32 precision.
    - "float32" for FP32 precision."""
    seed: int = 0
    """Random seed for reproducibility.

    We must set the global seed because otherwise,
    different tensor parallel workers would sample different tokens,
    leading to inconsistent results."""
    hf_config: PretrainedConfig = field(init=False)
    """The Hugging Face config of the model."""
    hf_text_config: PretrainedConfig = field(init=False)
    """The Hugging Face config of the text model (same as hf_config for text models)."""
    hf_config_path: str | None = None
    """Name or path of the Hugging Face config to use. If unspecified, model
    name or path will be used."""
    allowed_local_media_path: str = ""
    """Allowing API requests to read local images or videos from directories
    specified by the server file system. This is a security risk. Should only
    be enabled in trusted environments."""
    allowed_media_domains: list[str] | None = None
    """If set, only media URLs that belong to this domain can be used for
    multi-modal inputs. """
    revision: str | None = None
    """The specific model version to use. It can be a branch name, a tag name,
    or a commit id. If unspecified, will use the default version."""
    code_revision: str | None = None
    """The specific revision to use for the model code on the Hugging Face Hub.
    It can be a branch name, a tag name, or a commit id. If unspecified, will
    use the default version."""
    tokenizer_revision: str | None = None
    """The specific revision to use for the tokenizer on the Hugging Face Hub.
    It can be a branch name, a tag name, or a commit id. If unspecified, will
    use the default version."""
    max_model_len: int = Field(default=None, ge=-1)  # type: ignore[assignment]
    """Model context length (prompt and output). If unspecified, will be
    automatically derived from the model config.

    When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
    format. Examples:

    - 1k -> 1000
    - 1K -> 1024
    - 25.6k -> 25,600
    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
      GPU memory. This will use the model's maximum context length if it fits,
      otherwise it will find the largest length that can be accommodated."""
    spec_target_max_model_len: int | None = None
    """Specify the maximum length for spec decoding draft models."""
    quantization: QuantizationMethods | str | None = None
    """Method used to quantize the weights. If `None`, we first check the
    `quantization_config` attribute in the model config file. If that is
    `None`, we assume the model weights are not quantized and use `dtype` to
    determine the data type of the weights."""
    quantization_config: dict[str, Any] | QuantizationConfigArgs | None = None
    """User-facing quantization configuration. Carries per-layer-kind specs
    (linear, moe) and ignore patterns; see :class:`QuantizationConfigArgs`.
    Auto-populated from the matching online shorthand when `quantization` is
    one of the values in `ONLINE_QUANT_SHORTHAND_NAMES`."""
    allow_deprecated_quantization: bool = False
    """Whether to allow deprecated quantization methods."""
    enforce_eager: bool = False
    """Whether to always use eager-mode PyTorch. If True, we will disable CUDA
    graph and always execute the model in eager mode. If False, we will use
    CUDA graph and eager execution in hybrid for maximal performance and
    flexibility."""
    enable_return_routed_experts: bool = False
    """Whether to return routed experts."""
    max_logprobs: int = Field(default=20, ge=-1)
    """Maximum number of log probabilities to return when `logprobs` is
    specified in `SamplingParams`. The default value comes from the default for the
    OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length *
    vocab_size) logprobs are allowed to be returned and it may cause OOM."""
    logprobs_mode: LogprobsMode = "raw_logprobs"
    """Indicates the content returned in the logprobs and prompt_logprobs.
    Supported mode:
    1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits.
    Raw means the values before applying any logit processors, like bad words.
    Processed means the values after applying all processors, including
    temperature and top_k/top_p.
    """
    use_fp64_gumbel: bool = False
    """Whether to use FP64 (instead of FP32) random noise for Gumbel-max and
    equivalent exponential-race sampling. FP64 preserves lower-tail sampling
    events that fp32 uniform/exponential draws can truncate, at the cost of
    significantly lower throughput on most GPUs."""
    disable_sliding_window: bool = False
    """Whether to disable sliding window. If True, we will disable the sliding
    window functionality of the model, capping to sliding window size. If the
    model does not support sliding window, this argument is ignored."""
    disable_cascade_attn: bool = True
    """Disable cascade attention for V1. While cascade attention does not
    change the mathematical correctness, disabling it could be useful for
    preventing potential numerical issues. This defaults to True, so users
    must opt in to cascade attention by setting this to False. Even when this
    is set to False, cascade attention will only be used when the heuristic
    tells that it's beneficial."""
    skip_tokenizer_init: bool = False
    """Skip initialization of tokenizer and detokenizer. Expects valid
    `prompt_token_ids` and `None` for prompt from the input. The generated
    output will contain token ids."""
    enable_prompt_embeds: bool = False
    """If `True`, enables passing text embeddings as inputs via the
    `prompt_embeds` key.

    WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
    Only enable this flag for trusted users!"""
    served_model_name: str | list[str] | None = None
    """The model name(s) used in the API. If multiple names are provided, the
    server will respond to any of the provided names. The model name in the
    model field of a response will be the first name in this list. If not
    specified, the model name will be the same as the `--model` argument. Note
    that the name(s) will also be used as the `model_name` tag content of the
    Prometheus metrics; if multiple names are provided, the metrics tag will
    take the first one."""
    config_format: str | ConfigFormat = "auto"
    """The format of the model config to load:

    - "auto" will try to load the config in hf format if available after trying
      to load in mistral format.
    - "hf" will load the config in hf format.
    - "mistral" will load the config in mistral format."""
    hf_token: bool | str | None = None
    """The token to use as HTTP bearer authorization for remote files . If
    `True`, will use the token generated when running `hf auth login`
    (stored in `~/.cache/huggingface/token`)."""
    hf_overrides: HfOverrides = field(default_factory=dict)
    """If a dictionary, contains arguments to be forwarded to the Hugging Face
    config. If a callable, it is called to update the HuggingFace config."""
    generation_config: str = "auto"
    """The folder path to the generation config. Defaults to `"auto"`, the
    generation config will be loaded from model path. If set to `"vllm"`, no
    generation config is loaded, vLLM defaults will be used. If set to a folder
    path, the generation config will be loaded from the specified folder path.
    If `max_new_tokens` is specified in generation config, then it sets a
    server-wide limit on the number of output tokens for all requests."""
    override_generation_config: dict[str, Any] = field(default_factory=dict)
    """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If
    used with `--generation-config auto`, the override parameters will be
    merged with the default config from the model. If used with
    `--generation-config vllm`, only the override parameters are used."""
    enable_sleep_mode: bool = False
    """Enable sleep mode for the engine (only cuda and
    hip platforms are supported)."""
    enable_cumem_allocator: bool = False
    """Enable the custom cumem allocator to leverage advanced GPU memory
    allocation features such as multi-node NVLink support.

    Sleep mode automatically enables this allocator. Only cuda and hip
    platforms are supported.
    """
    model_impl: str | ModelImpl = "auto"
    """Which implementation of the model to use:

    - "auto" will try to use the vLLM implementation, if it exists, and fall back to the
      Transformers implementation if no vLLM implementation is available.
    - "vllm" will use the vLLM model implementation.
    - "transformers" will use the Transformers model implementation.
    - "terratorch" will use the TerraTorch model implementation.
    """
    override_attention_dtype: str | None = None
    """Override dtype for attention"""
    logits_processors: list[str | type[LogitsProcessor]] | None = None
    """One or more logits processors' fully-qualified class names or class
    definitions"""
    io_processor_plugin: str | None = None
    """IOProcessor plugin name to load at model startup"""
    renderer_num_workers: int = 1
    """Number of worker threads in the renderer thread pool. The pool is
    consumed by the async renderer path (e.g. the OpenAI-compatible API
    server started by `vllm serve`) to parallelize tokenization, chat
    template rendering, and multimodal preprocessing across concurrent
    requests.

    The offline `LLM` entrypoint uses the synchronous renderer path and
    processes prompts (including multimodal preprocessing) serially, so
    this setting has no effect there."""

    # Pooler config
    pooler_config: PoolerConfig | None = None
    """Pooler config which controls the behaviour of output pooling in pooling
    models."""

    # Multimodal config and init vars
    multimodal_config: MultiModalConfig | None = None
    """Configuration for multimodal model. If `None`, this will be inferred
    from the architecture of `self.model`."""
    # The `InitVar`s below are init-only inputs: they are passed to
    # `__post_init__`, which forwards the ones that are not `None` to
    # `MultiModalConfig` when the resolved model supports multimodal inputs.
    # `limit_mm_per_prompt` is forwarded under the keyword `limit_per_prompt`;
    # every other name is forwarded unchanged.
    language_model_only: InitVar[bool] = False
    limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
    enable_mm_embeds: InitVar[bool | None] = None
    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
    mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
    mm_processor_cache_gb: InitVar[float | None] = None
    mm_processor_cache_type: InitVar[MMCacheType | None] = None
    mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
    mm_encoder_only: InitVar[bool | None] = None
    mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
    mm_encoder_attn_backend: InitVar[AttentionBackendEnum | str | None] = None
    mm_encoder_attn_dtype: InitVar[str | None] = None
    mm_encoder_fp8_scale_path: InitVar[str | None] = None
    mm_encoder_fp8_scale_save_path: InitVar[str | None] = None
    mm_encoder_fp8_scale_save_margin: InitVar[float | None] = None
    interleave_mm_strings: InitVar[bool | None] = None
    skip_mm_profiling: InitVar[bool | None] = None
    video_pruning_rate: InitVar[float | None] = None
    mm_tensor_ipc: InitVar[MMTensorIPC] = None
    mm_ipc_gpu_memory_gb: InitVar[float | None] = None

    def compute_hash(self) -> str:
        """
        Return a hash of the config fields that shape the computation graph.

        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        The hash is meant to uniquely identify all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        # Field names that are deliberately left out of the hash.
        excluded_fields = {
            "convert",
            "tokenizer",
            "tokenizer_mode",
            "seed",
            "hf_config_path",
            "allowed_local_media_path",
            "allowed_media_domains",
            "tokenizer_revision",
            "spec_target_max_model_len",
            "enforce_eager",
            "logprobs_mode",
            "use_fp64_gumbel",
            "disable_cascade_attn",
            "skip_tokenizer_init",
            "served_model_name",
            "config_format",
            "hf_token",
            "hf_overrides",
            "override_attention_dtype",
            "logits_processors",
            "io_processor_plugin",
            "pooler_config",
            "multimodal_config",
            "limit_mm_per_prompt",
            "media_io_kwargs",
            "mm_processor_kwargs",
            "mm_processor_cache_gb",
            "mm_processor_cache_type",
            "mm_shm_cache_max_object_size_mb",
            "mm_encoder_tp_mode",
            "interleave_mm_strings",
            "skip_mm_profiling",
            "mm_ipc_gpu_memory_gb",
        }
        graph_factors = get_hash_factors(self, excluded_fields)

        # NOTE: For some models (e.g, Qwen3-VL), whether the MM code path is enabled
        # affects the computation graph of the language model, so that single
        # flag is hashed even though `multimodal_config` as a whole is excluded.
        mm_config = self.multimodal_config
        if mm_config:
            graph_factors["language_model_only"] = mm_config.language_model_only

        return hash_factors(graph_factors)

    def _update_nested(
        self,
        target: PretrainedConfig | dict[str, Any],
        updates: dict[str, Any],
    ) -> None:
        """Recursively updates a config or dict with nested updates."""
        for key, value in updates.items():
            if isinstance(value, dict):
                # Get the nested target
                if isinstance(target, dict):
                    nested_target = target.get(key)
                else:
                    nested_target = getattr(target, key, None)

                # If nested target exists and can be updated recursively
                if nested_target is not None and (
                    isinstance(nested_target, dict)
                    or hasattr(nested_target, "__dict__")
                ):
                    self._update_nested(nested_target, value)
                    continue

            # Set the value (base case)
            if isinstance(target, dict):
                target[key] = value
            else:
                setattr(target, key, value)

    def _apply_dict_overrides(
        self,
        config: PretrainedConfig,
        overrides: dict[str, Any],
    ) -> None:
        """Apply dict-valued overrides to `config` in place.

        For each key, if `config` already holds a `PretrainedConfig` under
        that name, the override is merged into it via `_update_nested`.
        Otherwise the override dict is assigned to the attribute as-is.
        """
        from transformers import PretrainedConfig

        for name, override in overrides.items():
            current = getattr(config, name, None)
            if isinstance(current, PretrainedConfig):
                # Nested sub-config: merge rather than replace it.
                self._update_nested(current, override)
                continue
            # Plain dict-valued parameter: assign directly.
            setattr(config, name, override)

    def __post_init__(
        self,
        # Multimodal config init vars
        language_model_only: bool,
        limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
        enable_mm_embeds: bool | None,
        media_io_kwargs: dict[str, dict[str, Any]] | None,
        mm_processor_kwargs: dict[str, Any] | None,
        mm_processor_cache_gb: float | None,
        mm_processor_cache_type: MMCacheType | None,
        mm_shm_cache_max_object_size_mb: int | None,
        mm_encoder_only: bool | None,
        mm_encoder_tp_mode: MMEncoderTPMode | None,
        mm_encoder_attn_backend: AttentionBackendEnum | str | None,
        mm_encoder_attn_dtype: str | None,
        mm_encoder_fp8_scale_path: str | None,
        mm_encoder_fp8_scale_save_path: str | None,
        mm_encoder_fp8_scale_save_margin: float | None,
        interleave_mm_strings: bool | None,
        skip_mm_profiling: bool | None,
        video_pruning_rate: float | None,
        mm_tensor_ipc: MMTensorIPC,
        mm_ipc_gpu_memory_gb: float | None,
    ) -> None:
        """Resolve and validate all derived state once the fields are set.

        The parameters are the multimodal `InitVar`s declared on the class.
        They are only consumed here: the ones that are not `None` are
        forwarded to `MultiModalConfig` if the resolved model supports
        multimodal inputs.

        The steps run in a fixed order because later ones read attributes
        assigned by earlier ones:

        1. Resolve `served_model_name`, then pass `model`, `tokenizer` and
           `hf_config_path` through `maybe_model_redirect`.
        2. Split `hf_overrides` into a callable, flat keyword overrides, and
           dict-valued overrides (the latter are applied after the HF config
           has been loaded).
        3. Check platform support for sleep mode and the cumem allocator.
        4. Load the HF config and derive `hf_text_config`,
           `model_arch_config`, `attention_chunk_size`, `encoder_config` and
           `hf_image_processor_config`.
        5. Resolve `runner_type` / `convert_type` and reject unsupported
           combinations.
        6. Pick a default `tokenizer_mode`, fill in `pooler_config`, and
           resolve `dtype` and `max_model_len`.
        7. Build `multimodal_config` and run the final verification hooks.

        Raises:
            ValueError: Raised directly here if sleep mode or the cumem
                allocator is requested on a platform that does not support
                it, if the requested runner is incompatible with the model,
                or if `renderer_num_workers > 1` is combined with an enabled
                multimodal processor cache.
        """
        # Keep set served_model_name before maybe_model_redirect(self.model)
        self.served_model_name = get_served_model_name(
            self.model, self.served_model_name
        )
        self.model = maybe_model_redirect(self.model)
        # The tokenizer is consistent with the model by default.
        if self.tokenizer is None:
            self.tokenizer = self.model
        if self.tokenizer_revision is None:
            self.tokenizer_revision = self.revision
        self.tokenizer = maybe_model_redirect(self.tokenizer)

        if isinstance(self.hf_config_path, str):
            self.hf_config_path = maybe_model_redirect(self.hf_config_path)

        if callable(self.hf_overrides):
            # A callable is handed to `get_config` untouched; there is nothing
            # to split.
            hf_overrides_kw: dict[str, Any] = {}
            hf_overrides_fn = self.hf_overrides
            dict_overrides: dict[str, Any] = {}
        else:
            # Separate dict overrides from flat ones
            # We'll determine how to apply dict overrides after loading the config
            hf_overrides_kw = {}
            dict_overrides = {}
            for key, value in self.hf_overrides.items():
                if isinstance(value, dict):
                    dict_overrides[key] = value
                else:
                    hf_overrides_kw[key] = value
            hf_overrides_fn = None

        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)

        if self.override_attention_dtype is not None and not current_platform.is_rocm():
            warnings.warn(
                "override-attention-dtype is set but not using ROCm platform",
                stacklevel=2,
            )

        if self.enable_sleep_mode:
            if not current_platform.is_sleep_mode_available():
                raise ValueError("Sleep mode is not supported on current platform.")
            if current_platform.is_cuda_alike() and not self.enable_cumem_allocator:
                logger.info_once(
                    "Enabling cumem allocator because sleep mode requires it."
                )
                self.enable_cumem_allocator = True
        # Checked after the sleep-mode block so that an allocator switched on
        # implicitly above is validated as well.
        if (
            self.enable_cumem_allocator
            and not current_platform.is_cumem_allocator_available()
        ):
            raise ValueError("cumem allocator is not supported on current platform.")

        hf_config = get_config(
            self.hf_config_path or self.model,
            self.trust_remote_code,
            self.revision,
            self.code_revision,
            self.config_format,
            hf_overrides_kw=hf_overrides_kw,
            hf_overrides_fn=hf_overrides_fn,
            token=self.hf_token,
        )
        self.hf_config = hf_config
        if dict_overrides:
            self._apply_dict_overrides(hf_config, dict_overrides)
        self.hf_text_config = get_hf_text_config(self.hf_config)
        self.model_arch_config = self.get_model_arch_config()
        self.attention_chunk_size = getattr(
            self.hf_text_config, "attention_chunk_size", None
        )
        self.encoder_config = self._get_encoder_config()
        self.hf_image_processor_config = get_hf_image_processor_config(
            self.model, hf_token=self.hf_token, revision=self.revision
        )

        architectures = self.architectures
        registry = self.registry
        is_generative_model = registry.is_text_generation_model(architectures, self)
        is_pooling_model = registry.is_pooling_model(architectures, self)

        self.runner_type = self._get_runner_type(
            architectures, self.runner, self.convert
        )
        self.convert_type = self._get_convert_type(
            architectures, self.runner_type, self.convert
        )

        if (
            is_pooling_model
            and not is_generative_model
            and self.runner_type in ("draft", "generate")
        ):
            raise ValueError(
                f"Embedding models do not support `--runner {self.runner_type}`. "
                "Use `--runner pooling` or `--runner auto` for embedding models."
            )
        if self.runner_type == "generate" and not is_generative_model:
            generate_converts = _RUNNER_CONVERTS["generate"]
            if self.convert_type not in generate_converts:
                # Currently we don't have any converters for generative models
                raise ValueError("This model does not support `--runner generate`.")
        if self.runner_type == "pooling" and not is_pooling_model:
            pooling_converts = _RUNNER_CONVERTS["pooling"]
            if self.convert_type not in pooling_converts:
                convert_option = "<" + "|".join(pooling_converts) + ">"
                raise ValueError(
                    "This model does not support `--runner pooling`. "
                    f"You can pass `--convert {convert_option}` to adapt "
                    "it into a pooling model."
                )

        # Note: Initialize these attributes early because transformers fallback
        # may fail to load dynamic modules in child processes
        model_info, arch = registry.inspect_model_cls(architectures, self)
        self._model_info = model_info
        self._architecture = arch
        logger.info("Resolved architecture: %s", arch)

        # Set default tokenizer modes based on model architecture
        if self.tokenizer_mode == "auto":
            if self.model_impl == "terratorch":
                self.tokenizer_mode = "terratorch"
            elif arch == "MoonshotKimiaForCausalLM":
                self.tokenizer_mode = "kimi_audio"
            elif arch == "DeepseekV32ForCausalLM":
                self.tokenizer_mode = "deepseek_v32"
            elif arch == "DeepseekV4ForCausalLM":
                self.tokenizer_mode = "deepseek_v4"

            # Only log when one of the branches above actually changed the mode.
            if self.tokenizer_mode != "auto":
                logger.info(
                    "Defaulting to tokenizer_mode=%r for %s",
                    self.tokenizer_mode,
                    arch,
                )

        # Init pooler config if needed
        if self.runner_type == "pooling":
            if self.pooler_config is None:
                self.pooler_config = PoolerConfig()

            base_config = get_pooling_config(self.model, self.revision)
            if base_config is not None:
                # Only set values that are not overridden by the user
                for k, v in base_config.items():
                    if getattr(self.pooler_config, k) is None:
                        setattr(self.pooler_config, k, v)

            # Fall back to the model's own defaults for anything still unset.
            default_seq_pooling_type = self._model_info.default_seq_pooling_type
            if self.pooler_config.seq_pooling_type is None:
                self.pooler_config.seq_pooling_type = default_seq_pooling_type
            default_tok_pooling_type = self._model_info.default_tok_pooling_type
            if self.pooler_config.tok_pooling_type is None:
                self.pooler_config.tok_pooling_type = default_tok_pooling_type

        self.dtype: torch.dtype = _get_and_verify_dtype(
            self.model,
            self.hf_config,
            self.dtype,
            is_pooling_model=self.runner_type == "pooling",
            revision=self.revision,
            config_format=self.config_format,
        )

        # Some checkpoints set sliding_window to 0 to indicate that sliding window is
        # disabled, but vLLM uses None for that. Convert 0 to None to avoid errors.
        # Set before get_and_verify_max_len to ensure that max_model_len does not get
        # capped to 0.
        if self.get_sliding_window() == 0:
            self.disable_sliding_window = True
            self.hf_text_config.sliding_window = None

        # Keep the user-supplied value before it is replaced by the verified one.
        self.original_max_model_len = self.max_model_len
        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)

        if self.is_encoder_decoder:
            mm_processor_cache_gb = 0
            logger.info("Encoder-decoder model detected, disabling mm processor cache.")

        # Init multimodal config if needed
        if self._model_info.supports_multimodal:
            if (
                mm_encoder_tp_mode == "data"
                and not self._model_info.supports_multimodal_encoder_tp_data
            ):
                logger.warning_once(
                    "This model does not support `--mm-encoder-tp-mode data`. "
                    "Falling back to `--mm-encoder-tp-mode weights`."
                )
                mm_encoder_tp_mode = "weights"

            mm_config_kwargs = dict(
                language_model_only=language_model_only,
                limit_per_prompt=limit_mm_per_prompt,
                enable_mm_embeds=enable_mm_embeds,
                media_io_kwargs=media_io_kwargs,
                mm_processor_kwargs=mm_processor_kwargs,
                mm_processor_cache_gb=mm_processor_cache_gb,
                mm_processor_cache_type=mm_processor_cache_type,
                mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb,
                mm_encoder_only=mm_encoder_only,
                mm_encoder_tp_mode=mm_encoder_tp_mode,
                mm_encoder_attn_backend=mm_encoder_attn_backend,
                mm_encoder_attn_dtype=mm_encoder_attn_dtype,
                mm_encoder_fp8_scale_path=mm_encoder_fp8_scale_path,
                mm_encoder_fp8_scale_save_path=mm_encoder_fp8_scale_save_path,
                mm_encoder_fp8_scale_save_margin=mm_encoder_fp8_scale_save_margin,
                interleave_mm_strings=interleave_mm_strings,
                skip_mm_profiling=skip_mm_profiling,
                video_pruning_rate=video_pruning_rate,
                mm_tensor_ipc=mm_tensor_ipc,
                mm_ipc_gpu_memory_gb=mm_ipc_gpu_memory_gb,
            )

            # Drop unset values so `MultiModalConfig` applies its own defaults.
            mm_config_kwargs = {
                k: v for k, v in mm_config_kwargs.items() if v is not None
            }

            self.multimodal_config = MultiModalConfig(**mm_config_kwargs)  # type: ignore[arg-type]

            if (
                self.renderer_num_workers > 1
                and self.multimodal_config.mm_processor_cache_gb > 0
            ):
                raise ValueError(
                    "Cannot use --renderer-num-workers > 1 with the "
                    "multimodal processor cache enabled. The cache is "
                    "not thread-safe and does not support concurrent "
                    "renderer workers. Please set "
                    "--renderer-num-workers 1 (the default), or "
                    "disable the cache with --mm-processor-cache-gb 0."
                )

        if self.disable_sliding_window:
            # Set after get_and_verify_max_len to ensure that max_model_len
            # can be correctly capped to sliding window size
            self.hf_text_config.sliding_window = None

        # Avoid running try_verify_and_update_config multiple times
        self.config_updated = False
        self._try_verify_and_update_model_config()
        self._verify_quantization()
        self._verify_cuda_graph()
        self._verify_bnb_config()

    def get_model_arch_config(
        self,
    ) -> ModelArchitectureConfig:
        """Build the architecture config from the loaded Hugging Face configs.

        The convertor class is looked up in `MODEL_ARCH_CONFIG_CONVERTORS` by
        `hf_config.model_type`, with `ModelArchConfigConvertorBase` as the
        fallback, and is given both `hf_config` and `hf_text_config`.
        """
        model_type = self.hf_config.model_type
        convertor_cls = MODEL_ARCH_CONFIG_CONVERTORS.get(
            model_type, ModelArchConfigConvertorBase
        )
        return convertor_cls(self.hf_config, self.hf_text_config).convert()

    @field_validator("tokenizer", "max_model_len", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Let `None` through untouched; validate everything else normally.

        `tokenizer` and `max_model_len` may still be `None` when this runs
        because their initialisation is delayed, so `None` bypasses `handler`.
        """
        return value if value is None else handler(value)

    @field_validator("tokenizer_mode", mode="after")
    @classmethod
    def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
        """Normalise `tokenizer_mode` to lower case once it has been validated.

        Declared as an explicit classmethod, matching the other field
        validators on this class.
        """
        return tokenizer_mode.lower()

    @field_validator("quantization", mode="before")
    @classmethod
    def validate_quantization_before(cls, value: Any) -> Any:
        """Lower-case a string `quantization` value before validation.

        Non-string values (including `None`) are returned unchanged.
        """
        return value.lower() if isinstance(value, str) else value

    @model_validator(mode="after")
    def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
        """Called after __post_init__

        Checks that the fields whose validation was deferred have ended up
        with usable values.

        Raises:
            ValueError: If `tokenizer` is not a string, or if `max_model_len`
                is not an integer of at least 1.
        """
        tokenizer_ok = isinstance(self.tokenizer, str)
        if not tokenizer_ok:
            raise ValueError(
                f"tokenizer must be a string, got "
                f"{type(self.tokenizer).__name__}: {self.tokenizer!r}. "
                "Please provide a valid tokenizer path or HuggingFace model ID."
            )

        max_len_ok = isinstance(self.max_model_len, int) and self.max_model_len >= 1
        if not max_len_ok:
            raise ValueError(
                f"max_model_len must be a positive integer, "
                f"got {type(self.max_model_len).__name__}: {self.max_model_len!r}. "
                "Example: max_model_len=2048"
            )

        return self

    def _get_transformers_backend_cls(self) -> str:
        """Determine which Transformers modeling backend class will be used if
        `model_impl` is set to `transformers` or `auto`.

        The name is assembled from a fixed prefix, optional `MultiModal` and
        `MoE` markers, and a suffix chosen from the runner and task.
        """
        name_parts = ["Transformers"]
        # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal
        if self.hf_config != self.hf_text_config:
            name_parts.append("MultiModal")
        if self.is_moe:
            name_parts.append("MoE")

        # Check if the architecture we're wrapping has defaults
        runner = task = None
        matched_defaults = try_match_architecture_defaults(self.architectures[0])
        if matched_defaults:
            _, (runner, task) = matched_defaults
        # User specified value take precedence
        if self.runner != "auto":
            runner = self.runner

        # Only consider Transformers modeling backend pooling classes if we're wrapping
        # an architecture that defaults to pooling. Otherwise, we return the LM class
        # and use adapters.
        pooling_suffixes = {
            "embed": "EmbeddingModel",
            "classify": "ForSequenceClassification",
        }
        if runner == "pooling" and task in pooling_suffixes:
            name_parts.append(pooling_suffixes[task])
        else:
            name_parts.append("ForCausalLM")

        return "".join(name_parts)

    def using_transformers_backend(self) -> bool:
        """Check if the model is using the Transformers modeling backend class."""
        used_cls = self._model_info.architecture
        transformers_backend_cls = self._get_transformers_backend_cls()
        return used_cls == transformers_backend_cls

    @property
    def registry(self):
        """The model registry, `me_models.ModelRegistry`."""
        model_registry = me_models.ModelRegistry
        return model_registry

    @property
    def architectures(self) -> list[str]:
        return self.model_arch_config.architectures

    @property
    def architecture(self) -> str:
        """The architecture vllm actually used."""
        return self._architecture

    def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
        """Pull model/tokenizer from Object Storage to temporary
        directory when needed.

        Args:
            model: Model name or path
            tokenizer: Tokenizer name or path
        """

        # Skip if model_weights is already set (model already pulled)
        if self.model_weights:
            return

        if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
            return

        if is_runai_obj_uri(model):
            object_storage_model = ObjectStorageModel(url=model)
            object_storage_model.pull_files(
                model, allow_pattern=["*.model", "*.py", "*.json"]
            )
            self.model_weights = model
            self.model = object_storage_model.dir

            # If tokenizer is same as model, download to same directory
            if model == tokenizer:
                object_storage_model.pull_files(
                    model,
                    ignore_pattern=[
                        "*.pt",
                        "*.safetensors",
                        "*.bin",
                        "*.tensors",
                        "*.pth",
                    ],
                )
                self.tokenizer = object_storage_model.dir
                return

        # Only download tokenizer if needed and not already handled
        if is_runai_obj_uri(tokenizer):
            object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
            object_storage_tokenizer.pull_files(
                tokenizer,
                ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
            )
            self.tokenizer = object_storage_tokenizer.dir

    def _get_encoder_config(self) -> dict[str, Any] | None:
        """Return the result of `get_sentence_transformer_tokenizer_config`
        for this model and revision."""
        model, revision = self.model, self.revision
        return get_sentence_transformer_tokenizer_config(model, revision)

    def _get_default_runner_type(
        self,
        architectures: list[str],
    ) -> RunnerType:
        """Pick the runner type to use when the user did not specify one.

        Order of precedence: a pooling config found for the model, then the
        registry's classification of supported architectures, then the
        per-architecture defaults, and finally `"generate"`.

        Args:
            architectures: Architecture names to inspect, in order.

        Returns:
            The default runner type.
        """
        model_registry = self.registry

        # Some Sentence Transformers models use *ForCausalLM archs
        has_pooling_config = get_pooling_config(self.model, self.revision)
        if has_pooling_config:
            return "pooling"

        for arch_name in architectures:
            if arch_name in model_registry.get_supported_archs():
                # The registry is queried with the full architecture list
                if model_registry.is_pooling_model(architectures, self):
                    return "pooling"
                if model_registry.is_text_generation_model(architectures, self):
                    return "generate"

            arch_defaults = try_match_architecture_defaults(arch_name)
            if arch_defaults:
                _, (default_runner, _) = arch_defaults
                return default_runner

        return "generate"

    def _get_runner_type(
        self,
        architectures: list[str],
        runner: RunnerOption,
        convert: ConvertOption,
    ) -> RunnerType:
        if runner != "auto":
            return runner

        if convert in {"auto", "none"}:
            runner_type = self._get_default_runner_type(architectures)
        else:
            runner_type = "pooling"

        # Don't log the most common case
        if runner_type != "generate":
            logger.info(
                "Resolved `--runner auto` to `--runner %s`. "
                "Pass the value explicitly to silence this message.",
                runner_type,
            )

        return runner_type

    def _get_default_convert_type(
        self,
        architectures: list[str],
        runner_type: RunnerType,
    ) -> ConvertType:
        registry = self.registry

        for arch in architectures:
            if arch in registry.get_supported_archs():
                if runner_type == "generate" and registry.is_text_generation_model(
                    architectures, self
                ):
                    return "none"
                if runner_type == "pooling" and registry.is_pooling_model(
                    architectures, self
                ):
                    return "none"

            match = try_match_architecture_defaults(arch, runner_type=runner_type)
            if match:
                _, (_, convert_type) = match
                return convert_type

        # This is to handle Sentence Transformers models that use *ForCausalLM
        # and also multi-modal pooling models which are not defined as
        # Sentence Transformers models
        if runner_type == "pooling":
            return "embed"

        return "none"

    def _get_convert_type(
        self,
        architectures: list[str],
        runner_type: RunnerType,
        convert: ConvertOption,
    ) -> ConvertType:
        if convert != "auto":
            return convert

        convert_type = self._get_default_convert_type(architectures, runner_type)

        # Don't log the most common case
        if convert_type != "none":
            logger.info(
                "Resolved `--convert auto` to `--convert %s`. "
                "Pass the value explicitly to silence this message.",
                convert_type,
            )

        return convert_type

    def _verify_quantization(self) -> None:
        """Resolve and validate the quantization method for this model.

        When the model architecture config carries a quantization config, the
        known quantization methods are probed through their
        `override_quantization_method` hook; the first override found replaces
        both the checkpoint's method and `self.quantization`. The resulting
        method must then agree with `self.quantization`, or fills it in when
        the user did not specify one. Finally the method is checked against
        `QUANTIZATION_METHODS`, verified by the current platform, and checked
        against the deprecated methods.

        Raises:
            ValueError: If an override-capable built-in method is missing from
                the `overrides` list, if the checkpoint's method disagrees with
                `self.quantization`, if the method is unknown, or if it is
                deprecated and `allow_deprecated_quantization` is not set.
        """
        supported_quantization = me_quant.QUANTIZATION_METHODS
        if self.quantization is not None:
            self.quantization = cast(me_quant.QuantizationMethods, self.quantization)

        # Parse quantization method from the HF model config, if available.
        quant_cfg = self.model_arch_config.quantization_config

        if quant_cfg is not None:
            quant_method = quant_cfg["quant_method"]
            # Quantization methods which are overrides (i.e. they have a
            # `override_quantization_method` method) must be checked in order
            # of preference (this is particularly important for GPTQ).
            overrides = [
                "auto_gptq",
                "gptq",
                "gptq_marlin",
                "auto_awq",
                "awq",
                "awq_marlin",
                "inc",
                "moe_wna16",
                "modelopt",
                "modelopt_fp4",
                "modelopt_mxfp8",
                "modelopt_mixed",
                # Ensure heavy backends are probed last to avoid unnecessary
                # imports during override detection (e.g., MXFP4 imports Triton)
                "mxfp4",
                "gpt_oss_mxfp4",
                "deepseek_v4_fp8",
                "humming",
            ]
            # if the user specifies humming, we should always use humming
            if self.quantization == "humming":
                overrides = ["humming"] + overrides
            quantization_methods = [
                q for q in supported_quantization if q not in overrides
            ]
            # Any custom overrides will be in quantization_methods so we place
            # them at the start of the list so custom overrides have preference
            # over the built-in ones.
            quantization_methods = quantization_methods + overrides

            # Detect which checkpoint is it
            for name in quantization_methods:
                method = me_quant.get_quantization_config(name)
                quantization_override = method.override_quantization_method(
                    quant_cfg, self.quantization, hf_config=self.hf_config
                )
                if quantization_override is not None:
                    # Raise error if the override is not custom (custom would
                    # be in QUANTIZATION_METHODS but not QuantizationMethods)
                    # and hasn't been added to the overrides list.
                    if (
                        name in get_args(me_quant.QuantizationMethods)
                        and name not in overrides
                    ):
                        raise ValueError(
                            f"Quantization method {name} is an override but "
                            "is has not been added to the `overrides` list "
                            "above. This is necessary to ensure that the "
                            "overrides are checked in order of preference."
                        )
                    quant_method = quantization_override
                    self.quantization = quantization_override
                    break

            # An empty method name in the checkpoint means "not quantized"
            quant_method = quant_method if quant_method != "" else None
            # Verify quantization configurations.
            if self.quantization is None:
                self.quantization = quant_method
            elif self.quantization != quant_method:
                raise ValueError(
                    "Quantization method specified in the model config "
                    f"({quant_method}) does not match the quantization "
                    f"method specified in the `quantization` argument "
                    f"({self.quantization})."
                )

        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(
                    f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}."
                )
            current_platform.verify_quantization(self.quantization)

        if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
            if self.allow_deprecated_quantization:
                logger.warning(
                    "The quantization method %s is deprecated "
                    "and will be removed in future versions of vLLM.",
                    self.quantization,
                )
            else:
                # Exceptions do not apply %-style arguments the way logging
                # calls do, so the method name is interpolated explicitly.
                raise ValueError(
                    f"The quantization method {self.quantization} is deprecated "
                    "and will be removed in future versions of vLLM. To bypass, "
                    "set `--allow-deprecated-quantization`."
                )

    def _verify_cuda_graph(self) -> None:
        # CUDAGraph capture not supported for encoder-decoder models on ROCm
        unsupported_rocm = self.is_encoder_decoder
        if unsupported_rocm and not self.enforce_eager and current_platform.is_rocm():
            logger.warning(
                "CUDA graph is not supported for %s on ROCm yet, fallback "
                "to eager mode.",
                self.model_arch_config.model_type,
            )
            self.enforce_eager = True

    def _verify_bnb_config(self) -> None:
        """
        The current version of bitsandbytes (0.46.1) with 8-bit models does not
        yet support CUDA graph.
        # TODO Remove this when bitsandbytes supports.
        """
        is_bitsandbytes = self.quantization == "bitsandbytes"
        has_quantization_config = self.model_arch_config.quantization_config is not None
        is_8bit = (
            self.model_arch_config.quantization_config.get("load_in_8bit", False)  # type: ignore[union-attr]
            if has_quantization_config
            else False
        )
        if all(
            [
                is_bitsandbytes,
                has_quantization_config,
                is_8bit,
                not self.enforce_eager,
            ]
        ):
            logger.warning(
                "CUDA graph is not supported on BitsAndBytes 8bit yet, "
                "fallback to the eager mode."
            )

            self.enforce_eager = True

    def _verify_with_expert_parallelism(self) -> None:
        if not self.is_moe:
            raise ValueError(
                "Number of experts in the model must be greater than 0 "
                "when expert parallelism is enabled."
            )

    def _try_verify_and_update_model_config(self):
        # Avoid running try_verify_and_update_config multiple times
        if getattr(self, "config_updated", False):
            return

        architecture = self.architecture
        if architecture is None:
            return

        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP,
        )

        cls = MODELS_CONFIG_MAP.get(architecture, None)
        if cls is not None:
            cls.verify_and_update_model_config(self)

    def verify_dual_chunk_attention_config(
        self,
        load_config: LoadConfig,
    ) -> None:
        if hasattr(self.hf_config, "dual_chunk_attention_config"):
            # Try loading the sparse attention config
            from vllm.model_executor.model_loader.weight_utils import (
                get_sparse_attention_config,
            )

            sparse_attn_config = get_sparse_attention_config(self, load_config)
            if sparse_attn_config:
                self.hf_config.dual_chunk_attention_config[
                    "sparse_attention_config"
                ] = sparse_attn_config
                if (
                    "sparse_attention_enabled"
                    not in self.hf_config.dual_chunk_attention_config
                ):
                    self.hf_config.dual_chunk_attention_config[
                        "sparse_attention_enabled"
                    ] = True

    def verify_with_parallel_config(
        self,
        parallel_config: ParallelConfig,
    ) -> None:
        total_num_attention_heads = self.model_arch_config.total_num_attention_heads
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if total_num_attention_heads % tensor_parallel_size != 0:
            raise ValueError(
                f"Total number of attention heads ({total_num_attention_heads})"
                " must be divisible by tensor parallel size "
                f"({tensor_parallel_size})."
            )

        if parallel_config.enable_expert_parallel:
            self._verify_with_expert_parallelism()

        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if pipeline_parallel_size > 1 and not self.registry.is_pp_supported_model(
            self.architectures, self
        ):
            raise NotImplementedError(
                "Pipeline parallelism is not supported for this model. "
                "Supported models implement the `SupportsPP` interface."
            )

        decode_context_parallel_size = parallel_config.decode_context_parallel_size
        if decode_context_parallel_size > 1 and not self.use_mla:
            total_num_kv_heads = self.get_total_num_kv_heads()
            assert tensor_parallel_size > total_num_kv_heads, (
                f"tensor parallel size {tensor_parallel_size} must be greater "
                f"than total num kv heads {total_num_kv_heads} when enable "
                f"decode context parallel for GQA/MQA"
            )

            max_dcp_size = tensor_parallel_size // total_num_kv_heads
            assert decode_context_parallel_size <= max_dcp_size, (
                f"decode context parallel size must less than or equal to "
                f"(tensor parallel size {tensor_parallel_size} // total "
                f"num kv heads {total_num_kv_heads}) = {max_dcp_size}, "
                f"but got {decode_context_parallel_size}"
            )

            num_q_per_kv = total_num_attention_heads // total_num_kv_heads
            assert num_q_per_kv % decode_context_parallel_size == 0, (
                f"Total number of q per kv attn heads ({num_q_per_kv})"
                " must be divisible by dcp world size when enable "
                "decode context parallel for GQA "
                f"({parallel_config.decode_context_parallel_size})."
            )

        # torch_shm uses a single IPC queue to rank 0; DP>1 is
        # incompatible because API servers can't know which
        # CoreEngine the scheduler will assign work to. TP>1 is
        # also not supported because this requires broadcasting
        # MM tensors between all TP ranks.
        if (
            self.multimodal_config is not None
            and self.multimodal_config.mm_tensor_ipc == "torch_shm"
            and parallel_config.world_size_across_dp > 1
        ):
            raise ValueError(
                "mm_tensor_ipc='torch_shm' is not supported with "
                "data_parallel_size > 1 or tensor_parallel_size > 1 "
                "or pipeline_parallel_size > 1."
            )

    def get_sliding_window(self) -> int | None:
        """Get the sliding window size from the HF text config if present."""
        return getattr(self.hf_text_config, "sliding_window", None)

    def get_vocab_size(self) -> int:
        return self.model_arch_config.vocab_size

    def get_hidden_size(self) -> int:
        return self.model_arch_config.hidden_size

    def get_inputs_embeds_size(self) -> int:
        """Return the size of `inputs_embeds`.

        Looks up `projection_dim` / `projection_size` on the HF text config
        via `getattr_iter`, with `get_hidden_size` as the default factory.
        """
        # The size of inputs_embeds is usually identical to the size
        # of the hidden states, however there are exceptions, such as
        # embedding models like CLIP and SigLIP
        projection_attr_names = ("projection_dim", "projection_size")
        text_config = self.hf_text_config
        return getattr_iter(
            text_config,
            projection_attr_names,
            default_factory=self.get_hidden_size,
        )

    @property
    def is_deepseek_mla(self) -> bool:
        return self.model_arch_config.is_deepseek_mla

    @property
    def is_mm_prefix_lm(self) -> bool:
        return self.model_arch_config.is_mm_prefix_lm

    @property
    def rswa_window(self) -> int | None:
        return self.model_arch_config.rswa_window

    def get_head_size(self) -> int:
        return self.model_arch_config.head_size

    def get_total_num_kv_heads(self) -> int:
        """Returns the total number of KV heads."""
        return self.model_arch_config.total_num_kv_heads

    def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
        """Returns the number of KV heads per GPU."""
        if self.use_mla:
            # When using MLA during decode it becomes MQA
            return 1

        total_num_kv_heads = self.get_total_num_kv_heads()
        # If tensor parallelism is used, we divide the number of KV heads by
        # the tensor parallel size. We will replicate the KV heads in the
        # case where the number of KV heads is smaller than the tensor
        # parallel size so each GPU has at least one KV head.
        return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)

    def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int:
        num_heads = self.model_arch_config.total_num_attention_heads
        return num_heads // parallel_config.tensor_parallel_size

    def get_num_experts(self) -> int:
        return self.model_arch_config.num_experts

    def get_total_num_hidden_layers(self) -> int:
        return self.model_arch_config.total_num_hidden_layers

    def get_layers_start_end_indices(
        self, parallel_config: ParallelConfig
    ) -> tuple[int, int]:
        """Return the `(start, end)` layer indices that `get_pp_indices`
        yields for this rank's pipeline stage.

        The pipeline rank is derived from `parallel_config.rank`, the tensor
        parallel size and the pipeline parallel size.
        """
        from vllm.distributed.utils import get_pp_indices

        num_layers = self.get_total_num_hidden_layers()
        tp_size = parallel_config.tensor_parallel_size
        pp_size = parallel_config.pipeline_parallel_size

        # the layout order is: DP x PP x TP
        pp_rank = (parallel_config.rank // tp_size) % pp_size
        first_layer, end_layer = get_pp_indices(num_layers, pp_rank, pp_size)
        return first_layer, end_layer

    def get_num_layers(self, parallel_config: ParallelConfig) -> int:
        start, end = self.get_layers_start_end_indices(parallel_config)
        return end - start

    def get_num_layers_by_block_type(
        self,
        parallel_config: ParallelConfig,
        block_type: LayerBlockType = "attention",
    ) -> int:
        # This function relies on 'layers_block_type' in hf_config,
        # for w/o this attribute, we will need to have workarounds like so
        attn_block_type = block_type == "attention"
        is_transformer = (
            not self.is_hybrid and not self.has_noops and not self.is_attention_free
        )
        start, end = self.get_layers_start_end_indices(parallel_config)

        if is_transformer:
            # Handle the basic case first
            return end - start if attn_block_type else 0
        elif self.is_attention_free:
            # Attention free
            # Note that this code assumes there
            # is only one type of attention-free block type.
            return 0 if attn_block_type else end - start
        elif self.has_noops:
            block_configs = self.hf_config.block_configs
            return sum(not bc.attention.no_op for bc in block_configs[start:end])
        else:
            # Hybrid model Jamba
            layers_block_type_value = getattr(
                self.hf_text_config, "layers_block_type", None
            )
            if layers_block_type_value is not None:
                if self.model_arch_config.text_model_type == "zamba2":
                    if attn_block_type:
                        return sum(
                            t == "hybrid" for t in layers_block_type_value[start:end]
                        )
                    else:
                        return self.get_num_layers(parallel_config)
                return sum(t == block_type for t in layers_block_type_value[start:end])

            # Hybrid model Minimax
            attn_type_list = getattr(self.hf_config, "attn_type_list", None)
            if attn_type_list:
                return sum(t == 1 for t in attn_type_list[start:end])

            # Hybrid model Qwen3Next Qwen3.5 Series
            layer_types_value = getattr(self.hf_text_config, "layer_types", None)
            if layer_types_value is not None:
                if block_type == "attention":
                    return sum(
                        t == "full_attention" for t in layer_types_value[start:end]
                    )
                elif block_type == "linear_attention":
                    return sum(
                        t == "linear_attention" for t in layer_types_value[start:end]
                    )
                else:
                    return sum(t == block_type for t in layer_types_value[start:end])

            if (
                layers_block_type_value is None
                and attn_type_list is None
                and layer_types_value is None
            ):
                raise ValueError(
                    "The model is an hybrid without a layers_block_type or an "
                    "attn_type_list, or a layer_types in the hf_config, "
                    f"cannot determine the num of {block_type} layers"
                )
            raise AssertionError(f"Unsupported block type: {block_type}")

    def get_mamba_chunk_size(self) -> int:
        """
        Returns the mamba chunk size if it exists
        """
        # used by e.g. Bamba, FalconH1, Granite, PLaMo2
        chunk_size = getattr(self.hf_text_config, "mamba_chunk_size", None)
        if chunk_size is None:
            # used by e.g. Mamba2, NemotronH, Zamba
            chunk_size = getattr(self.hf_text_config, "chunk_size", None)

        # Since Mamba1 does not have a chunk notion
        # we use a default chunk size of 2048.
        if chunk_size is None:
            chunk_size = 2048

        return chunk_size

    def get_multimodal_config(self) -> MultiModalConfig:
        """
        Get the multimodal configuration of the model.

        Raises:
            ValueError: If the model is not multimodal.
        """
        if self.multimodal_config is None:
            raise ValueError("The model is not multimodal.")

        return self.multimodal_config

    def try_get_generation_config(self) -> dict[str, Any]:
        """
        This method attempts to retrieve the non-default values of the
        generation config for this model.

        The generation config can contain information about special tokens, as
        well as sampling parameters. Which is why this method exists separately
        to `get_diff_sampling_param`.

        When `self.generation_config` is `"auto"` or `"vllm"` the config is
        looked up from `hf_config_path` (or `model`) at `self.revision`;
        otherwise `self.generation_config` itself is used as the location and
        no revision is passed.

        Returns:
            A dictionary containing the non-default generation config, or an
            empty dictionary when no generation config could be loaded.
        """
        lookup_kwargs: dict[str, Any] = {
            "trust_remote_code": self.trust_remote_code,
            "config_format": self.config_format,
            "hf_token": self.hf_token,
        }
        if self.generation_config in {"auto", "vllm"}:
            location = self.hf_config_path or self.model
            lookup_kwargs["revision"] = self.revision
        else:
            location = self.generation_config

        # Resolves to the module-level helper of the same name
        loaded = try_get_generation_config(location, **lookup_kwargs)
        return {} if loaded is None else loaded.to_diff_dict()

    def get_diff_sampling_param(self) -> dict[str, Any]:
        """
        This method returns a dictionary containing the non-default sampling
        parameters with `override_generation_config` applied.

        The default sampling parameters are:

        - vLLM's neutral defaults if `self.generation_config="vllm"`
        - the model's defaults if `self.generation_config="auto"`
        - as defined in `generation_config.json` if
            `self.generation_config="path/to/generation_config/dir"`

        Returns:
            A dictionary containing the non-default sampling parameters.
        """
        src = self.generation_config

        config = {} if src == "vllm" else self.try_get_generation_config()

        # Overriding with given generation config
        config.update(self.override_generation_config)

        available_params = [
            "repetition_penalty",
            "temperature",
            "top_k",
            "top_p",
            "min_p",
            "max_new_tokens",
        ]
        if any(p in config for p in available_params):
            diff_sampling_param = {
                p: config.get(p) for p in available_params if config.get(p) is not None
            }
            # Huggingface definition of max_new_tokens is equivalent
            # to vLLM's max_tokens
            if "max_new_tokens" in diff_sampling_param:
                diff_sampling_param["max_tokens"] = diff_sampling_param.pop(
                    "max_new_tokens"
                )
        else:
            diff_sampling_param = {}

        if diff_sampling_param and src != "vllm":
            logger.warning_once(
                "Default vLLM sampling parameters have been overridden by %s: `%s`. "
                "If this is not intended, please relaunch vLLM instance "
                "with `--generation-config vllm`.",
                "the model's `generation_config.json`" if src == "auto" else src,
                str(diff_sampling_param),
                scope="local",
            )

        return diff_sampling_param

    def get_pooling_task(
        self, supported_tasks: tuple[SupportedTask, ...]
    ) -> PoolingTask | None:
        if self.pooler_config is None:
            return None

        pooling_task = self.pooler_config.task

        if pooling_task is not None:
            if self.pooler_config.task in supported_tasks:
                return self.pooler_config.task
            else:
                raise RuntimeError(
                    f"Unsupported task: {pooling_task!r} "
                    f"Supported tasks: {supported_tasks}"
                )

        if "token_classify" in supported_tasks:
            for architecture in self.architectures:
                if "ForTokenClassification" in architecture:
                    return "token_classify"

        priority: list[PoolingTask] = [
            "embed&token_classify",
            "embed",
            "classify",
            "token_embed",
            "token_classify",
            "plugin",
        ]
        for task in priority:
            if task in supported_tasks:
                return task
        return None

    @cached_property
    def is_encoder_decoder(self) -> bool:
        """Extract the HF encoder/decoder model flag from `hf_config`
        (computed once and cached)."""
        hf_config = self.hf_config
        # Resolves to the module-level helper of the same name
        return is_encoder_decoder(hf_config)

    @cached_property
    def is_diffusion(self) -> bool:
        """Detect discrete diffusion (dLLM) models from HF config."""
        return getattr(self.hf_config, "canvas_length", None) is not None

    @property
    def uses_alibi(self) -> bool:
        cfg = self.hf_text_config

        return (
            getattr(cfg, "alibi", False)  # Falcon
            or "BloomForCausalLM" in self.architectures  # Bloom
            or getattr(cfg, "position_encoding_type", "") == "alibi"  # codellm_1b_alibi
            or (
                hasattr(cfg, "attn_config")  # MPT
                and (
                    (
                        isinstance(cfg.attn_config, dict)
                        and cfg.attn_config.get("alibi", False)
                    )
                    or (
                        not isinstance(cfg.attn_config, dict)
                        and getattr(cfg.attn_config, "alibi", False)
                    )
                )
            )
        )

    @property
    def uses_mrope(self) -> bool:
        """The result of the module-level `uses_mrope` helper for
        `hf_config`."""
        hf_config = self.hf_config
        return uses_mrope(hf_config)

    @property
    def uses_xdrope_dim(self) -> int:
        """The result of the module-level `uses_xdrope_dim` helper for
        `hf_config`."""
        hf_config = self.hf_config
        return uses_xdrope_dim(hf_config)

    @property
    def is_multimodal_model(self) -> bool:
        return self.multimodal_config is not None

    @property
    def is_multimodal_raw_input_only_model(self) -> bool:
        return self._model_info.supports_multimodal_raw_input_only

    @property
    def requires_raw_input_tokens(self) -> bool:
        return self._model_info.requires_raw_input_tokens

    @property
    def score_type(self) -> ScoreType:
        """
        Scoring API handles score/rerank for:

        - "classify" task (score_type: cross-encoder models)
        - "embed" task (score_type: bi-encoder models)
        - "token_embed" task (score_type: late interaction models)
        """
        # fixme: self._model_info.score_type is the score type before
        #  as_seq_cls_model, which is "bi-encoder", rather than the
        #  score type after as_seq_cls_model, which is "cross-encoder".
        #  Therefore, the following logic is required.
        return (
            "cross-encoder"
            if self.convert_type == "classify"
            else self._model_info.score_type
        )

    @property
    def is_pp_supported(self) -> bool:
        return self._model_info.supports_pp

    @property
    def is_attention_free(self) -> bool:
        return self._model_info.is_attention_free

    @property
    def is_hybrid(self) -> bool:
        """Whether the model is hybrid and has at least one non-attention layer.

        A model flagged as hybrid by the model info still counts as non-hybrid
        when its HF config lists `layer_types` and every entry is
        "attention".
        """
        if not self._model_info.is_hybrid:
            return False
        # granite-4.0-micro ships a hybrid config even though it does not
        # actually contain any non-attention layers.
        layer_kinds = getattr(self.hf_config, "layer_types", None)
        if layer_kinds is None:
            return True
        return any(kind != "attention" for kind in layer_kinds)

    @property
    def has_noops(self) -> bool:
        """Value of `has_noops` from the model info."""
        model_info = self._model_info
        return model_info.has_noops

    @property
    def has_inner_state(self):
        """Value of `has_inner_state` from the model info."""
        model_info = self._model_info
        return model_info.has_inner_state

    @property
    def supports_mamba_prefix_caching(self) -> bool:
        """Value of `supports_mamba_prefix_caching` from the model info."""
        model_info = self._model_info
        return model_info.supports_mamba_prefix_caching

    @property
    def use_mla(self) -> bool:
        """Whether MLA is used: `is_deepseek_mla` holds and
        `envs.VLLM_MLA_DISABLE` is not set."""
        is_mla_model = self.is_deepseek_mla
        if not is_mla_model:
            # Hand back the falsy value itself, as `and` would.
            return is_mla_model
        return not envs.VLLM_MLA_DISABLE

    @property
    def is_matryoshka(self) -> bool:
        """Whether the HF config marks the model as a Matryoshka model.

        Returns:
            True when the HF config has a non-empty `matryoshka_dimensions`
            or a truthy `is_matryoshka` attribute, False otherwise. The
            result is always a real `bool`: a non-boolean `is_matryoshka`
            value (e.g. `None`) is coerced instead of being returned as-is.
        """
        hf_config = self.hf_config
        if getattr(hf_config, "matryoshka_dimensions", None):
            return True
        # Coerce so that a null / non-bool config value cannot leak through
        # the `-> bool` contract.
        return bool(getattr(hf_config, "is_matryoshka", False))

    @property
    def matryoshka_dimensions(self):
        """`matryoshka_dimensions` from the HF config, or None if it is absent."""
        hf_cfg = self.hf_config
        return getattr(hf_cfg, "matryoshka_dimensions", None)

    @property
    def use_sep_token(self) -> bool:
        """Whether a separating token is used, as configured on the HF config.

        The deprecated `use_pad_token` attribute takes precedence when it is
        set (a deprecation warning is logged); otherwise `use_sep_token` is
        read, defaulting to True.
        """
        # cross_encoder models defaults to using separating token.
        # `llm as reranker` defaults to not using separating token.
        hf_cfg = self.hf_config

        legacy_flag = getattr(hf_cfg, "use_pad_token", None)
        if legacy_flag is None:
            return getattr(hf_cfg, "use_sep_token", True)

        logger.warning_once(
            "use_pad_token has been deprecated; please use use_sep_token instead."
        )
        return legacy_flag

    @property
    def head_dtype(self) -> torch.dtype:
        """
        Dtype of the "head", i.e. the last Linear layer(s) of an LLM,
        such as the lm_head in a generation model,
        or the score or classifier in a classification model.

        `head_dtype` currently only supports pooling models.

        - The pooling model defaults to using fp32 head, you can use
          --hf-overrides '{"head_dtype": "model"}' to disable it.

        Falls back to the model dtype (with a warning) when the runner is not
        "pooling" and the resolved head dtype differs from the model dtype,
        or when the current platform does not list the resolved dtype as
        supported.
        """
        model_dtype = self.dtype
        runner_type = self.runner_type

        resolved = _get_head_dtype(
            config=self.hf_config, dtype=model_dtype, runner_type=runner_type
        )

        if runner_type != "pooling" and resolved != model_dtype:
            logger.warning_once(
                "`head_dtype` currently only supports pooling models, "
                "fallback to model dtype [%s].",
                model_dtype,
            )
            return model_dtype

        if resolved not in current_platform.supported_dtypes:
            logger.warning_once(
                "The current platform does not support [%s] head dtype, "
                "fallback to model dtype [%s].",
                resolved,
                model_dtype,
            )
            return model_dtype

        logger.debug_once("head dtype: %s", resolved)
        return resolved

    @property
    def embedding_size(self):
        """Embedding size of the model.

        Resolution order: `embedding_size` on the HF config, then the
        `out_features` of the last dense module returned by
        `try_get_dense_modules`, then `self.get_hidden_size()`.
        """
        # An embedding_size set by the model config wins (e.g., Voyage models)
        configured = getattr(self.hf_config, "embedding_size", None)
        if configured is not None:
            return configured

        modules = try_get_dense_modules(self.model, revision=self.revision)
        if modules is None:
            return self.get_hidden_size()
        return modules[-1]["out_features"]

    def get_and_verify_max_len(self, max_model_len: int):
        """Run `max_model_len` through `_get_and_verify_max_len` and log it.

        Args:
            max_model_len: The requested maximum model length.

        Returns:
            The value returned by `_get_and_verify_max_len`.
        """
        # The tokenizer config is only consulted when a pooling model uses
        # absolute position_embedding; otherwise it stays None.
        tokenizer_config = None
        if self.runner_type == "pooling":
            position_type = getattr(self.hf_config, "position_embedding_type", "")
            if position_type == "absolute":
                tokenizer_config = try_get_tokenizer_config(
                    self.tokenizer,
                    trust_remote_code=self.trust_remote_code,
                    revision=self.tokenizer_revision,
                )

        verified_len = _get_and_verify_max_len(
            hf_config=self.hf_text_config,
            model_arch_config=self.model_arch_config,
            tokenizer_config=tokenizer_config,
            max_model_len=max_model_len,
            disable_sliding_window=self.disable_sliding_window,
            sliding_window=self.get_sliding_window(),
            spec_target_max_model_len=self.spec_target_max_model_len,
            encoder_config=self.encoder_config,
        )
        logger.info("Using max model len %s", verified_len)
        return verified_len

    @property
    def attn_type(self) -> AttnTypeStr:
        """Determine the attention type based on model configuration."""
        if self.pooler_config is None:
            # No pooler: the first matching flag decides, "decoder" otherwise.
            if self.is_hybrid:
                return "hybrid"
            if self.is_attention_free:
                return "attention_free"
            if self.is_encoder_decoder:
                return "encoder_decoder"
            return "decoder"

        # Pooler configured: CLS pooling or a non-causal HF config means
        # encoder-only; otherwise defer to the model info.
        if self._model_info.default_seq_pooling_type == "CLS":
            return "encoder_only"
        if getattr(self.hf_config, "is_causal", True):
            return self._model_info.attn_type
        return "encoder_only"

    @property
    def is_chunked_prefill_supported(self) -> bool:
        """Whether chunked prefill is supported, decided from the attention
        type and, for pooling models, the pooling types of the pooler config."""
        attn_type = self.attn_type
        pooler_config = self.pooler_config

        if not pooler_config:
            # for generative models
            if attn_type == "encoder_decoder":
                logger.debug("Encoder decoder models do not support chunked prefill.")
                return False

            logger.debug("Generative models support chunked prefill.")
            return True

        # for pooling models
        if attn_type == "encoder_only":
            logger.debug(
                "Pooling models with bidirectional attn "
                "do not support chunked prefill."
            )
            return False

        if attn_type != "decoder":
            # vllm currently does not have pooling models using hybrid,
            # attention_free or encoder_decoder attn types.
            return attn_type != "encoder_decoder"

        seq_pooling = pooler_config.seq_pooling_type
        tok_pooling = pooler_config.tok_pooling_type
        if seq_pooling in ("MEAN", "CLS") or tok_pooling == "STEP":
            logger.debug(
                "Pooling models with causal attn and %s/%s pooling "
                "do not support chunked prefill.",
                seq_pooling,
                tok_pooling,
            )
            return False

        logger.debug(
            "Pooling models with causal attn and %s/%s pooling "
            "support chunked prefill.",
            seq_pooling,
            tok_pooling,
        )
        return True

    @property
    def is_prefix_caching_supported(self) -> bool:
        """Whether prefix caching is supported, decided from the attention
        type and, for pooling models, the pooling types of the pooler config."""
        attn_type = self.attn_type
        pooler_config = self.pooler_config

        if not pooler_config:
            # for generative models
            unsupported_messages = {
                "hybrid": (
                    "Hybrid models do not support prefix caching since the feature "
                    "is still experimental."
                ),
                "attention_free": (
                    "Attention free models do not support prefix caching since the "
                    "feature is still experimental."
                ),
                "encoder_decoder": (
                    "Encoder decoder models do not support prefix caching."
                ),
            }
            message = unsupported_messages.get(attn_type)
            if message is not None:
                logger.debug(message)
                return False

            # Anything else is handled as attn_type == "decoder"
            logger.debug("Generative models support prefix caching.")
            return True

        # for pooling models
        if attn_type == "encoder_only":
            logger.debug(
                "Pooling models with bidirectional attn "
                "do not support prefix caching."
            )
            return False

        if attn_type != "decoder":
            # vllm currently does not have pooling models using hybrid,
            # attention_free or encoder_decoder attn types.
            return False

        seq_pooling = pooler_config.seq_pooling_type
        tok_pooling = pooler_config.tok_pooling_type
        if seq_pooling in ("MEAN", "CLS") or tok_pooling == "STEP":
            logger.debug(
                "Pooling models with causal attn and %s/%s pooling "
                "do not support prefix caching.",
                seq_pooling,
                tok_pooling,
            )
            return False

        logger.debug(
            "Pooling models with causal attn and %s/%s pooling "
            "support prefix caching.",
            seq_pooling,
            tok_pooling,
        )
        return True

    @property
    def is_moe(self) -> bool:
        """Whether `get_num_experts()` reports more than zero experts."""
        expert_count = self.get_num_experts()
        return expert_count > 0

    @property
    def is_quantized(self) -> bool:
        """Whether the HF config carries a non-None `quantization_config`."""
        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
        return hf_quant_config is not None

    def is_nvfp4_quantized(self) -> bool:
        """Whether the model is quantized with an NVFP4 format.

        Returns:
            True when the quantization method is "modelopt_fp4", or when it
            is "compressed-tensors" and the `format` entry of the model arch
            quantization config contains "nvfp4" (case-insensitive).
            False otherwise, including when the quantization config or its
            `format` entry is missing or null.
        """
        # ModelOpt NVFP4 checkpoints resolve to modelopt_fp4 quantization method
        if self.quantization == "modelopt_fp4":
            return True

        if self.quantization != "compressed-tensors":
            return False

        # For Compressed Tensors we look for `"format": "nvfp4-pack-quantized"`
        # in the quantization config
        quant_config = self.model_arch_config.quantization_config
        if quant_config is None:
            return False

        # `format` can be absent or explicitly null; treat both as "no format"
        # instead of calling `.lower()` on None.
        quant_format = quant_config.get("format") or ""
        return "nvfp4" in str(quant_format).lower()

allow_deprecated_quantization = False class-attribute instance-attribute

Whether to allow deprecated quantization methods.

allowed_local_media_path = '' class-attribute instance-attribute

Allowing API requests to read local images or videos from directories specified by the server file system. This is a security risk. Should only be enabled in trusted environments.

allowed_media_domains = None class-attribute instance-attribute

If set, only media URLs that belong to this domain can be used for multi-modal inputs.

architecture property

The architecture vllm actually used.

attn_type property

Determine the attention type based on model configuration.

code_revision = None class-attribute instance-attribute

The specific revision to use for the model code on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

config_format = 'auto' class-attribute instance-attribute

The format of the model config to load:

  • "auto" will try to load the config in hf format if available after trying to load in mistral format.
  • "hf" will load the config in hf format.
  • "mistral" will load the config in mistral format.

convert = 'auto' class-attribute instance-attribute

Convert the model using adapters defined in vllm.model_executor.models.adapters. The most common use case is to adapt a text generation model to be used for pooling tasks.

disable_cascade_attn = True class-attribute instance-attribute

Disable cascade attention for V1. While cascade attention does not change the mathematical correctness, disabling it could be useful for preventing potential numerical issues. This defaults to True, so users must opt in to cascade attention by setting this to False. Even when this is set to False, cascade attention will only be used when the heuristic tells that it's beneficial.

disable_sliding_window = False class-attribute instance-attribute

Whether to disable sliding window. If True, we will disable the sliding window functionality of the model, capping to sliding window size. If the model does not support sliding window, this argument is ignored.

dtype = 'auto' class-attribute instance-attribute

Data type for model weights and activations:

  • "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
  • "half" for FP16. Recommended for AWQ quantization.
  • "float16" is the same as "half".
  • "bfloat16" for a balance between precision and range.
  • "float" is shorthand for FP32 precision.
  • "float32" for FP32 precision.

enable_cumem_allocator = False class-attribute instance-attribute

Enable the custom cumem allocator to leverage advanced GPU memory allocation features such as multi-node NVLink support.

Sleep mode automatically enables this allocator. Only cuda and hip platforms are supported.

enable_prompt_embeds = False class-attribute instance-attribute

If True, enables passing text embeddings as inputs via the prompt_embeds key.

WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users!

enable_return_routed_experts = False class-attribute instance-attribute

Whether to return routed experts.

enable_sleep_mode = False class-attribute instance-attribute

Enable sleep mode for the engine (only cuda and hip platforms are supported).

enforce_eager = False class-attribute instance-attribute

Whether to always use eager-mode PyTorch. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid for maximal performance and flexibility.

generation_config = 'auto' class-attribute instance-attribute

The folder path to the generation config. Defaults to "auto", the generation config will be loaded from model path. If set to "vllm", no generation config is loaded, vLLM defaults will be used. If set to a folder path, the generation config will be loaded from the specified folder path. If max_new_tokens is specified in generation config, then it sets a server-wide limit on the number of output tokens for all requests.

head_dtype property

"head" refers to the last Linear layer(s) of an LLM, such as the lm_head in a generation model, or the score or classifier in a classification model.

head_dtype currently only supports pooling models.

  • The pooling model defaults to using fp32 head, you can use --hf-overrides '{"head_dtype": "model"}' to disable it.

hf_config = field(init=False) class-attribute instance-attribute

The Hugging Face config of the model.

hf_config_path = None class-attribute instance-attribute

Name or path of the Hugging Face config to use. If unspecified, model name or path will be used.

hf_overrides = field(default_factory=dict) class-attribute instance-attribute

If a dictionary, contains arguments to be forwarded to the Hugging Face config. If a callable, it is called to update the HuggingFace config.

hf_text_config = field(init=False) class-attribute instance-attribute

The Hugging Face config of the text model (same as hf_config for text models).

hf_token = None class-attribute instance-attribute

The token to use as HTTP bearer authorization for remote files. If True, will use the token generated when running hf auth login (stored in ~/.cache/huggingface/token).

io_processor_plugin = None class-attribute instance-attribute

IOProcessor plugin name to load at model startup

is_diffusion cached property

Detect discrete diffusion (dLLM) models from HF config.

is_encoder_decoder cached property

Extract the HF encoder/decoder model flag.

logits_processors = None class-attribute instance-attribute

One or more logits processors' fully-qualified class names or class definitions

logprobs_mode = 'raw_logprobs' class-attribute instance-attribute

Indicates the content returned in the logprobs and prompt_logprobs. Supported mode: 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. Raw means the values before applying any logit processors, like bad words. Processed means the values after applying all processors, including temperature and top_k/top_p.

max_logprobs = Field(default=20, ge=(-1)) class-attribute instance-attribute

Maximum number of log probabilities to return when logprobs is specified in SamplingParams. The default value comes from the default for the OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.

max_model_len = Field(default=None, ge=(-1)) class-attribute instance-attribute

Model context length (prompt and output). If unspecified, will be automatically derived from the model config.

When passing via --max-model-len, supports k/m/g/K/M/G in human-readable format. Examples:

  • 1k -> 1000
  • 1K -> 1024
  • 25.6k -> 25,600
  • -1 or 'auto' -> Automatically choose the maximum model length that fits in GPU memory. This will use the model's maximum context length if it fits, otherwise it will find the largest length that can be accommodated.

model = 'Qwen/Qwen3-0.6B' class-attribute instance-attribute

Name or path of the Hugging Face model to use. It is also used as the content for model_name tag in metrics output when served_model_name is not specified.

model_impl = 'auto' class-attribute instance-attribute

Which implementation of the model to use:

  • "auto" will try to use the vLLM implementation, if it exists, and fall back to the Transformers implementation if no vLLM implementation is available.
  • "vllm" will use the vLLM model implementation.
  • "transformers" will use the Transformers model implementation.
  • "terratorch" will use the TerraTorch model implementation.

model_weights = '' class-attribute instance-attribute

Original model weights path. Used when the model is pulled from object storage (e.g., RunAI) to preserve the original URI while model points to the local directory.

multimodal_config = None class-attribute instance-attribute

Configuration for multimodal model. If None, this will be inferred from the architecture of self.model.

override_attention_dtype = None class-attribute instance-attribute

Override dtype for attention

override_generation_config = field(default_factory=dict) class-attribute instance-attribute

Overrides or sets generation config. e.g. {"temperature": 0.5}. If used with --generation-config auto, the override parameters will be merged with the default config from the model. If used with --generation-config vllm, only the override parameters are used.

pooler_config = None class-attribute instance-attribute

Pooler config which controls the behaviour of output pooling in pooling models.

quantization = None class-attribute instance-attribute

Method used to quantize the weights. If None, we first check the quantization_config attribute in the model config file. If that is None, we assume the model weights are not quantized and use dtype to determine the data type of the weights.

quantization_config = None class-attribute instance-attribute

User-facing quantization configuration. Carries per-layer-kind specs (linear, moe) and ignore patterns; see :class:QuantizationConfigArgs. Auto-populated from the matching online shorthand when quantization is one of the values in ONLINE_QUANT_SHORTHAND_NAMES.

renderer_num_workers = 1 class-attribute instance-attribute

Number of worker threads in the renderer thread pool. The pool is consumed by the async renderer path (e.g. the OpenAI-compatible API server started by vllm serve) to parallelize tokenization, chat template rendering, and multimodal preprocessing across concurrent requests.

The offline LLM entrypoint uses the synchronous renderer path and processes prompts (including multimodal preprocessing) serially, so this setting has no effect there.

revision = None class-attribute instance-attribute

The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

runner = 'auto' class-attribute instance-attribute

The type of model runner to use. Each vLLM instance only supports one model runner, even if the same model can be used for multiple types.

score_type property

Scoring API handles score/rerank for:

  • "classify" task (score_type: cross-encoder models)
  • "embed" task (score_type: bi-encoder models)
  • "token_embed" task (score_type: late interaction models)

seed = 0 class-attribute instance-attribute

Random seed for reproducibility.

We must set the global seed because otherwise, different tensor parallel workers would sample different tokens, leading to inconsistent results.

served_model_name = None class-attribute instance-attribute

The model name(s) used in the API. If multiple names are provided, the server will respond to any of the provided names. The model name in the model field of a response will be the first name in this list. If not specified, the model name will be the same as the --model argument. Note that the name(s) will also be used in the model_name tag content of Prometheus metrics; if multiple names are provided, the metrics tag will take the first one.

skip_tokenizer_init = False class-attribute instance-attribute

Skip initialization of tokenizer and detokenizer. Expects valid prompt_token_ids and None for prompt from the input. The generated output will contain token ids.

spec_target_max_model_len = None class-attribute instance-attribute

Specify the maximum length for spec decoding draft models.

tokenizer = None class-attribute instance-attribute

Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.

tokenizer_mode = 'auto' class-attribute instance-attribute

Tokenizer mode:

  • "auto" will use the tokenizer from mistral_common for Mistral models if available, otherwise it will use the "hf" tokenizer.
  • "hf" will use the fast tokenizer if available.
  • "slow" will always use the slow tokenizer.
  • "mistral" will always use the tokenizer from mistral_common.
  • "deepseek_v32" will always use the tokenizer from deepseek_v32.
  • "deepseek_v4" will always use the tokenizer from deepseek_v4.
  • Other custom values can be supported via plugins.

To swap the Rust BPE backend that powers HF fast tokenizers for the fastokens implementation, set VLLM_USE_FASTOKENS=1 instead — that override applies to any mode that loads an HF fast tokenizer (hf, deepseek_v32, deepseek_v4, …).

tokenizer_revision = None class-attribute instance-attribute

The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

trust_remote_code = False class-attribute instance-attribute

Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.

use_fp64_gumbel = False class-attribute instance-attribute

Whether to use FP64 (instead of FP32) random noise for Gumbel-max and equivalent exponential-race sampling. FP64 preserves lower-tail sampling events that fp32 uniform/exponential draws can truncate, at the cost of significantly lower throughput on most GPUs.

_apply_dict_overrides(config, overrides)

Apply dict overrides, handling both nested configs and dict values.

Source code in vllm/config/model.py
def _apply_dict_overrides(
    self,
    config: PretrainedConfig,
    overrides: dict[str, Any],
) -> None:
    """Apply dict overrides, handling both nested configs and dict values.

    Args:
        config: The config object the overrides are written to.
        overrides: Mapping of attribute name to override value.
    """
    from transformers import PretrainedConfig

    for name, override in overrides.items():
        current = getattr(config, name, None)
        if isinstance(current, PretrainedConfig):
            # The attribute is a nested config: merge into it recursively.
            self._update_nested(current, override)
            continue
        # Anything else (including a missing attribute) is set directly.
        setattr(config, name, override)

_get_transformers_backend_cls()

Determine which Transformers modeling backend class will be used if model_impl is set to transformers or auto.

Source code in vllm/config/model.py
def _get_transformers_backend_cls(self) -> str:
    """Determine which Transformers modeling backend class will be used if
    `model_impl` is set to `transformers` or `auto`."""
    name_parts = ["Transformers"]
    # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal
    if self.hf_config != self.hf_text_config:
        name_parts.append("MultiModal")
    if self.is_moe:
        name_parts.append("MoE")

    # Start from the defaults of the architecture we're wrapping, if any
    runner = task = None
    defaults = try_match_architecture_defaults(self.architectures[0])
    if defaults:
        _, (runner, task) = defaults
    # User specified value take precedence
    if self.runner != "auto":
        runner = self.runner

    # Only consider Transformers modeling backend pooling classes if we're wrapping
    # an architecture that defaults to pooling. Otherwise, we return the LM class
    # and use adapters.
    if runner == "pooling" and task == "embed":
        name_parts.append("EmbeddingModel")
    elif runner == "pooling" and task == "classify":
        name_parts.append("ForSequenceClassification")
    else:
        name_parts.append("ForCausalLM")
    return "".join(name_parts)

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialisation is delayed.

Source code in vllm/config/model.py
@field_validator("tokenizer", "max_model_len", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Skip validation if the value is `None` when initialisation is delayed.

    Any other value is passed to `handler` and its result is returned.
    """
    return None if value is None else handler(value)

_update_nested(target, updates)

Recursively updates a config or dict with nested updates.

Source code in vllm/config/model.py
def _update_nested(
    self,
    target: PretrainedConfig | dict[str, Any],
    updates: dict[str, Any],
) -> None:
    """Recursively updates a config or dict with nested updates."""
    for key, value in updates.items():
        if isinstance(value, dict):
            # Get the nested target
            if isinstance(target, dict):
                nested_target = target.get(key)
            else:
                nested_target = getattr(target, key, None)

            # If nested target exists and can be updated recursively
            if nested_target is not None and (
                isinstance(nested_target, dict)
                or hasattr(nested_target, "__dict__")
            ):
                self._update_nested(nested_target, value)
                continue

        # Set the value (base case)
        if isinstance(target, dict):
            target[key] = value
        else:
            setattr(target, key, value)

_verify_bnb_config()

The current version of bitsandbytes (0.46.1) with 8-bit models does not yet support CUDA graph.

TODO Remove this when bitsandbytes supports.

Source code in vllm/config/model.py
def _verify_bnb_config(self) -> None:
    """
    The current version of bitsandbytes (0.46.1) with 8-bit models does not
    yet support CUDA graph.
    # TODO Remove this when bitsandbytes supports.
    """
    is_bitsandbytes = self.quantization == "bitsandbytes"
    has_quantization_config = self.model_arch_config.quantization_config is not None
    is_8bit = (
        self.model_arch_config.quantization_config.get("load_in_8bit", False)  # type: ignore[union-attr]
        if has_quantization_config
        else False
    )
    if all(
        [
            is_bitsandbytes,
            has_quantization_config,
            is_8bit,
            not self.enforce_eager,
        ]
    ):
        logger.warning(
            "CUDA graph is not supported on BitsAndBytes 8bit yet, "
            "fallback to the eager mode."
        )

        self.enforce_eager = True

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/model.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Provide a hash that uniquely identifies all the configs
    that affect the structure of the computation
    graph from input ids/embeddings to the final hidden states,
    excluding anything before input ids/embeddings and after
    the final hidden states.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # Field names passed to get_hash_factors as the ignore set.
    excluded_fields = {
        "convert",
        "tokenizer",
        "tokenizer_mode",
        "seed",
        "hf_config_path",
        "allowed_local_media_path",
        "allowed_media_domains",
        "tokenizer_revision",
        "spec_target_max_model_len",
        "enforce_eager",
        "logprobs_mode",
        "use_fp64_gumbel",
        "disable_cascade_attn",
        "skip_tokenizer_init",
        "served_model_name",
        "config_format",
        "hf_token",
        "hf_overrides",
        "override_attention_dtype",
        "logits_processors",
        "io_processor_plugin",
        "pooler_config",
        "multimodal_config",
        "limit_mm_per_prompt",
        "media_io_kwargs",
        "mm_processor_kwargs",
        "mm_processor_cache_gb",
        "mm_processor_cache_type",
        "mm_shm_cache_max_object_size_mb",
        "mm_encoder_tp_mode",
        "interleave_mm_strings",
        "skip_mm_profiling",
        "mm_ipc_gpu_memory_gb",
    }

    hash_inputs = get_hash_factors(self, excluded_fields)

    # NOTE: For some models (e.g, Qwen3-VL), whether the MM code path is enabled
    # affects the computation graph of the language model, therefore we add it
    # here early.
    mm_config = self.multimodal_config
    if mm_config:
        hash_inputs["language_model_only"] = mm_config.language_model_only
    return hash_factors(hash_inputs)

get_diff_sampling_param()

This method returns a dictionary containing the non-default sampling parameters with override_generation_config applied.

The default sampling parameters are:

  • vLLM's neutral defaults if self.generation_config="vllm"
  • the model's defaults if self.generation_config="auto"
  • as defined in generation_config.json if self.generation_config="path/to/generation_config/dir"

Returns:

  • dict[str, Any]

    A dictionary containing the non-default sampling parameters.

Source code in vllm/config/model.py
def get_diff_sampling_param(self) -> dict[str, Any]:
    """
    This method returns a dictionary containing the non-default sampling
    parameters with `override_generation_config` applied.

    The default sampling parameters are:

    - vLLM's neutral defaults if `self.generation_config="vllm"`
    - the model's defaults if `self.generation_config="auto"`
    - as defined in `generation_config.json` if
        `self.generation_config="path/to/generation_config/dir"`

    Returns:
        A dictionary containing the non-default sampling parameters.
    """
    source = self.generation_config

    if source == "vllm":
        merged = {}
    else:
        merged = self.try_get_generation_config()

    # Overriding with given generation config
    merged.update(self.override_generation_config)

    sampling_keys = (
        "repetition_penalty",
        "temperature",
        "top_k",
        "top_p",
        "min_p",
        "max_new_tokens",
    )
    diff_sampling_param = {}
    for key in sampling_keys:
        value = merged.get(key)
        if value is None:
            continue
        # Huggingface definition of max_new_tokens is equivalent
        # to vLLM's max_tokens
        out_key = "max_tokens" if key == "max_new_tokens" else key
        diff_sampling_param[out_key] = value

    if diff_sampling_param and source != "vllm":
        logger.warning_once(
            "Default vLLM sampling parameters have been overridden by %s: `%s`. "
            "If this is not intended, please relaunch vLLM instance "
            "with `--generation-config vllm`.",
            "the model's `generation_config.json`" if source == "auto" else source,
            str(diff_sampling_param),
            scope="local",
        )

    return diff_sampling_param

get_mamba_chunk_size()

Returns the mamba chunk size if it exists

Source code in vllm/config/model.py
def get_mamba_chunk_size(self) -> int:
    """
    Look up the mamba chunk size on the HF text config.

    Returns:
        The first non-None value of `mamba_chunk_size` or `chunk_size`,
        or 2048 when neither attribute provides one.
    """
    # `mamba_chunk_size` is used by e.g. Bamba, FalconH1, Granite, PLaMo2;
    # `chunk_size` is used by e.g. Mamba2, NemotronH, Zamba.
    for attr_name in ("mamba_chunk_size", "chunk_size"):
        candidate = getattr(self.hf_text_config, attr_name, None)
        if candidate is not None:
            return candidate

    # Mamba1 has no notion of chunks, so a default of 2048 is used.
    return 2048

get_multimodal_config()

Get the multimodal configuration of the model.

Raises:

  • ValueError – If the model is not multimodal.

Source code in vllm/config/model.py
def get_multimodal_config(self) -> MultiModalConfig:
    """
    Return the model's multimodal configuration.

    Raises:
        ValueError: If `self.multimodal_config` is None, i.e. the model
            is not multimodal.
    """
    mm_config = self.multimodal_config
    if mm_config is not None:
        return mm_config

    raise ValueError("The model is not multimodal.")

get_num_kv_heads(parallel_config)

Returns the number of KV heads per GPU.

Source code in vllm/config/model.py
def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
    """Compute how many KV heads each GPU holds.

    Args:
        parallel_config: Supplies `tensor_parallel_size`.

    Returns:
        1 when MLA is in use; otherwise the total KV head count divided
        (floor) by the tensor parallel size, never less than 1.
    """
    # With MLA, decode becomes MQA, so there is a single KV head.
    if self.use_mla:
        return 1

    total_heads = self.get_total_num_kv_heads()
    heads_per_gpu = total_heads // parallel_config.tensor_parallel_size

    # When there are fewer KV heads than tensor-parallel ranks the floor
    # division yields 0; heads are replicated so every GPU keeps at least one.
    if heads_per_gpu < 1:
        return 1
    return heads_per_gpu

get_sliding_window()

Get the sliding window size from the HF text config if present.

Source code in vllm/config/model.py
def get_sliding_window(self) -> int | None:
    """Return `sliding_window` from the HF text config, or None if absent."""
    text_config = self.hf_text_config
    return getattr(text_config, "sliding_window", None)

get_total_num_kv_heads()

Returns the total number of KV heads.

Source code in vllm/config/model.py
def get_total_num_kv_heads(self) -> int:
    """Return `total_num_kv_heads` as recorded on the model arch config."""
    arch_config = self.model_arch_config
    return arch_config.total_num_kv_heads

maybe_pull_model_tokenizer_for_runai(model, tokenizer)

Pull model/tokenizer from Object Storage to temporary directory when needed.

Parameters:

  • model

    (str) –

    Model name or path

  • tokenizer

    (str) –

    Tokenizer name or path

Source code in vllm/config/model.py
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
    """Fetch model and/or tokenizer files from Object Storage when required.

    Does nothing when `self.model_weights` is already set, or when neither
    argument is recognised by `is_runai_obj_uri`. Otherwise the relevant
    files are pulled and `self.model_weights`, `self.model` and
    `self.tokenizer` are updated to match.

    Args:
        model: Model name or path
        tokenizer: Tokenizer name or path
    """
    # A populated `model_weights` means the model has been pulled already.
    if self.model_weights:
        return

    model_in_storage = is_runai_obj_uri(model)
    if not model_in_storage and not is_runai_obj_uri(tokenizer):
        return

    # File patterns excluded when pulling tokenizer files.
    tokenizer_ignore = ["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"]

    if model_in_storage:
        model_store = ObjectStorageModel(url=model)
        model_store.pull_files(model, allow_pattern=["*.model", "*.py", "*.json"])
        # Remember the original URI and point `model` at the pulled directory.
        self.model_weights = model
        self.model = model_store.dir

        # Tokenizer shares the model location: pull into the same directory.
        if model == tokenizer:
            model_store.pull_files(model, ignore_pattern=tokenizer_ignore)
            self.tokenizer = model_store.dir
            return

    # Tokenizer has not been handled above; pull it separately if needed.
    if is_runai_obj_uri(tokenizer):
        tokenizer_store = ObjectStorageModel(url=tokenizer)
        tokenizer_store.pull_files(tokenizer, ignore_pattern=tokenizer_ignore)
        self.tokenizer = tokenizer_store.dir

try_get_generation_config()

This method attempts to retrieve the non-default values of the generation config for this model.

The generation config can contain information about special tokens as well as sampling parameters, which is why this method exists separately from get_diff_sampling_param.

Returns:

  • dict[str, Any]

    A dictionary containing the non-default generation config.

Source code in vllm/config/model.py
def try_get_generation_config(self) -> dict[str, Any]:
    """
    Attempt to load the generation config for this model and return its
    non-default values.

    The generation config can carry information about special tokens as
    well as sampling parameters, which is why this method is kept separate
    from `get_diff_sampling_param`.

    Returns:
        A dictionary containing the non-default generation config, or an
        empty dictionary when no generation config could be retrieved.
    """
    # Arguments forwarded regardless of where the config is loaded from.
    shared_kwargs = {
        "trust_remote_code": self.trust_remote_code,
        "config_format": self.config_format,
        "hf_token": self.hf_token,
    }

    if self.generation_config in {"auto", "vllm"}:
        # Load from the model's own location, passing the model revision.
        loaded = try_get_generation_config(
            self.hf_config_path or self.model,
            revision=self.revision,
            **shared_kwargs,
        )
    else:
        # An explicit location was given; no revision is forwarded here.
        loaded = try_get_generation_config(self.generation_config, **shared_kwargs)

    return {} if loaded is None else loaded.to_diff_dict()

using_transformers_backend()

Check if the model is using the Transformers modeling backend class.

Source code in vllm/config/model.py
def using_transformers_backend(self) -> bool:
    """Tell whether the resolved architecture is the Transformers backend class.

    Returns:
        True when `self._model_info.architecture` equals the value returned
        by `self._get_transformers_backend_cls()`.
    """
    return self._model_info.architecture == self._get_transformers_backend_cls()

validate_model_config_after()

Called after `__post_init__`.

Source code in vllm/config/model.py
@model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
    """Called after __post_init__ to sanity-check the finished config.

    Raises:
        ValueError: If `tokenizer` is not a string, or if `max_model_len`
            is not an integer of at least 1.
    """
    tokenizer = self.tokenizer
    if not isinstance(tokenizer, str):
        raise ValueError(
            "tokenizer must be a string, got "
            f"{type(tokenizer).__name__}: {tokenizer!r}. "
            "Please provide a valid tokenizer path or HuggingFace model ID."
        )

    max_len = self.max_model_len
    if not (isinstance(max_len, int) and max_len >= 1):
        raise ValueError(
            "max_model_len must be a positive integer, "
            f"got {type(max_len).__name__}: {max_len!r}. "
            "Example: max_model_len=2048"
        )

    return self

MultiModalConfig

Controls the behavior of multimodal models.

Methods:

Attributes:

Source code in vllm/config/multimodal.py
@config
class MultiModalConfig:
    """Controls the behavior of multimodal models."""

    language_model_only: bool = False
    """If True, disables all multimodal inputs by setting all modality limits to 0.
    Equivalent to setting `--limit-mm-per-prompt` to 0 for every modality."""
    limit_per_prompt: MMDummyOptions = Field(default_factory=dict)
    """The maximum number of input items and options allowed per
    prompt for each modality.

    Defaults to 999 for each modality.

    Legacy format (count only):
        {"image": 16, "video": 2}

    Configurable format (with options):
        {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
        "image": {"count": 5, "width": 512, "height": 512}}

    Mixed format (combining both):
        {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
        "height": 512}}
    """
    enable_mm_embeds: bool = False
    """If `True`, enables passing multimodal embeddings:
    for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
    for the OpenAI-compatible server, this refers to chat messages with content
    `"type": "*_embeds"`.

    When enabled with `--limit-mm-per-prompt` set to 0 for a modality,
    precomputed embeddings skip count validation for that modality, 
    saving memory by not loading encoder modules while still enabling 
    embeddings as an input. Limits greater than 0 still apply to embeddings.

    WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
    Only enable this flag for trusted users!"""
    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
    mm_processor_kwargs: dict[str, object] | None = None
    """Arguments to be forwarded to the model's processor for multi-modal data,
    e.g., image processor. Overrides for the multi-modal processor obtained
    from `transformers.AutoProcessor.from_pretrained`.

    The available overrides depend on the model that is being run.

    For example, for Phi-3-Vision:
    `{"num_crops": 4}`."""
    mm_processor_cache_gb: float = Field(default=4, ge=0)
    """The size (in GiB) of the multi-modal processor cache, which is used to
    avoid re-processing past multi-modal inputs.

    This cache is duplicated for each API process and engine core process,
    resulting in a total memory usage of
    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.

    Set to `0` to disable this cache completely (not recommended)."""
    mm_processor_cache_type: MMCacheType = "lru"
    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
    mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
    """Size limit (in MiB) for each object stored in the multi-modal processor
    shared memory cache. Only effective when `mm_processor_cache_type` is
    `"shm"`."""
    mm_encoder_only: bool = False
    """
    When enabled, skips the language component of the model.

    This is usually only valid in disaggregated Encoder process.
    """
    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
    """Indicates how to optimize multi-modal encoder inference using tensor
    parallelism (TP).

    - `"weights"`: Within the same vLLM engine, split the weights of
      each layer across TP ranks. (default TP behavior)
    - `"data"`: Within the same vLLM engine, split the batched input data
      across TP ranks to process the data in parallel, while hosting
      the full weights on each TP rank.
      This batch-level DP is not to be confused with API request-level
      DP (which is controlled by `--data-parallel-size`).
      This is only supported on a per-model basis and falls back to
      `"weights"` if the encoder does not support DP."""
    mm_encoder_attn_backend: AttentionBackendEnum | None = None
    """Optional override for the multi-modal encoder attention backend when
    using vision transformers. Accepts any value from
    `vllm.v1.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
    mm_encoder_attn_dtype: Literal["fp8"] | None = None
    """Optional dtype override for ViT encoder attention. Set to `"fp8"` to
    enable FP8 quantization via the FlashInfer cuDNN backend. When set to
    `"fp8"` without a scale file, dynamic scaling is used automatically.
    See docs/features/quantization/fp8_vit_attn.md for details."""
    mm_encoder_fp8_scale_path: str | None = None
    """Path to a JSON file containing per-layer FP8 Q/K/V scales for ViT
    encoder attention. When provided (with `mm_encoder_attn_dtype="fp8"`),
    static scaling is used. When omitted, dynamic scaling is used."""
    mm_encoder_fp8_scale_save_path: str | None = None
    """When set with dynamic FP8 scaling (`mm_encoder_attn_dtype="fp8"`
    and no `mm_encoder_fp8_scale_path`), saves the calibrated scales to
    this file after the amax history buffer is full. The saved file can
    then be used as `mm_encoder_fp8_scale_path` in subsequent runs."""
    mm_encoder_fp8_scale_save_margin: float = Field(default=1.5, gt=0.0)
    """Safety margin multiplied onto scales when auto-saving. A value > 1
    leaves headroom so that inputs with larger activations than the
    calibration set do not overflow FP8 range. Default 1.5."""
    interleave_mm_strings: bool = False
    """Enable fully interleaved support for multimodal prompts, while using
    --chat-template-content-format=string."""
    skip_mm_profiling: bool = False
    """When enabled, skips multimodal memory profiling and only profiles with
    language backbone model during engine initialization.

    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
    video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
    """Sets pruning rate for video pruning via Efficient Video Sampling.
    Value sits in range [0;1) and determines fraction of media tokens
    from each video to be pruned.
    """
    mm_tensor_ipc: MMTensorIPC = "direct_rpc"
    """IPC (inter-process communication) method for multimodal tensors.
    - "direct_rpc": Use msgspec serialization via RPC
    - "torch_shm": Use torch.multiprocessing shared memory for zero-copy IPC
    Defaults to "direct_rpc". """
    mm_ipc_gpu_memory_gb: float = Field(default=0, ge=0)
    """Amount of GPU memory (in GiB) sequestered on the engine's device for
    GPU-side multimodal work in the API-server (frontend) process, such as
    hardware video decoding.

    This budget is carved out of the engine's KV-cache memory so the headroom
    physically exists, and frontend GPU decode paths acquire from a blocking
    byte-counting semaphore of this size before allocating on the device.

    Set to `0` (default) to disable frontend GPU multimodal memory gating."""

    @field_validator("limit_per_prompt", mode="before")
    @classmethod
    def _validate_limit_per_prompt(
        cls,
        value: dict[str, int | dict[str, int]],
    ) -> MMDummyOptions:
        """Normalize `limit_per_prompt` entries into dummy-option objects.

        A bare integer is treated as the legacy format and wrapped as
        `{"count": <int>}`. Each entry is then converted to the option class
        matching its modality key (`video`, `image`, `audio`), with
        `BaseDummyOptions` used for any other key.
        """
        out: MMDummyOptions = {}

        for k, v in value.items():
            # Handle legacy format where only count is specified
            if isinstance(v, int):
                v = {"count": v}

            # Convert to the appropriate DummyOptions subclass
            if k == "video":
                out[k] = VideoDummyOptions(**v)
            elif k == "image":
                out[k] = ImageDummyOptions(**v)
            elif k == "audio":
                out[k] = AudioDummyOptions(**v)
            else:
                out[k] = BaseDummyOptions(**v)

        return out

    @field_validator("mm_encoder_attn_backend", mode="before")
    @classmethod
    def _validate_mm_encoder_attn_backend(
        cls, value: str | AttentionBackendEnum | None
    ) -> AttentionBackendEnum | None:
        """Coerce `mm_encoder_attn_backend` into an `AttentionBackendEnum`.

        `None` and enum members pass through unchanged; strings are
        upper-cased and looked up by member name.

        Raises:
            ValueError: If the value names the removed 'XFORMERS' backend,
                or is neither a string, an `AttentionBackendEnum`, nor None.
        """
        if isinstance(value, str) and value.upper() == "XFORMERS":
            raise ValueError(
                "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
                "details). Please select a supported attention backend."
            )

        if value is None or isinstance(value, AttentionBackendEnum):
            return value

        # Raised explicitly instead of using `assert`, so the type check is
        # still enforced when Python runs with assertions disabled (-O).
        if not isinstance(value, str):
            raise ValueError(
                "mm_encoder_attn_backend must be a string or an AttentionBackendEnum."
            )
        return AttentionBackendEnum[value.upper()]

    @model_validator(mode="after")
    def _validate_multimodal_config(self):
        """Check cross-field constraints once all fields are populated.

        Raises:
            ValueError: If `mm_shm_cache_max_object_size_mb` differs from its
                default while `mm_processor_cache_type` is not `"shm"`; if an
                FP8 scale path or save path is set while
                `mm_encoder_attn_dtype` is not `"fp8"`; or if both the scale
                path and the scale save path are set.
            FileNotFoundError: If `mm_encoder_fp8_scale_path` is not an
                existing file, or the parent directory of
                `mm_encoder_fp8_scale_save_path` does not exist.
        """
        # A value different from the class-level default is taken to mean
        # the user set it explicitly.
        if self.mm_processor_cache_type != "shm" and (
            self.mm_shm_cache_max_object_size_mb
            != MultiModalConfig.mm_shm_cache_max_object_size_mb
        ):
            raise ValueError(
                "'mm_shm_cache_max_object_size_mb' should only be set when "
                "'mm_processor_cache_type' is 'shm'."
            )
        # Validate FP8 scale path combinations.
        if self.mm_encoder_attn_dtype != "fp8" and (
            self.mm_encoder_fp8_scale_path is not None
            or self.mm_encoder_fp8_scale_save_path is not None
        ):
            raise ValueError(
                "'mm_encoder_fp8_scale_path' and "
                "'mm_encoder_fp8_scale_save_path' require "
                "'mm_encoder_attn_dtype' to be 'fp8'."
            )
        if (
            self.mm_encoder_fp8_scale_path is not None
            and self.mm_encoder_fp8_scale_save_path is not None
        ):
            raise ValueError(
                "'mm_encoder_fp8_scale_save_path' cannot be used with "
                "'mm_encoder_fp8_scale_path' (saving requires dynamic scaling)."
            )

        # Validate file paths exist.
        if self.mm_encoder_fp8_scale_path is not None:
            scale_path = Path(self.mm_encoder_fp8_scale_path)
            if not scale_path.is_file():
                raise FileNotFoundError(f"FP8 scale file not found: {scale_path}")
        if self.mm_encoder_fp8_scale_save_path is not None:
            # Only the parent directory has to exist; the save file itself
            # is not required to be present.
            save_parent = Path(self.mm_encoder_fp8_scale_save_path).parent
            if not save_parent.is_dir():
                raise FileNotFoundError(
                    f"Parent directory for FP8 scale save path not found: {save_parent}"
                )
        return self

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        factors: list[Any] = [
            self.mm_encoder_attn_backend.name
            if self.mm_encoder_attn_backend is not None
            else None,
            self.mm_encoder_tp_mode,
            self.mm_encoder_attn_dtype,
            self.mm_encoder_fp8_scale_path,
        ]
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def get_limit_per_prompt(self, modality: str) -> int:
        """
        Get the maximum number of input items allowed per prompt
        for the given modality (backward compatible).
        """
        if self.language_model_only:
            return 0

        limit_data = self.limit_per_prompt.get(modality)

        if limit_data is None:
            # Unspecified modality is set to 999 by default
            return 999

        return limit_data.count

    def merge_mm_processor_kwargs(
        self,
        inference_kwargs: Mapping[str, object],
    ) -> dict[str, object]:
        """
        Get the keyword arguments to pass to the multi-modal processor
        according to the extra arguments passed during inference.
        """
        kwargs = self.mm_processor_kwargs or {}
        # Inference-time values take precedence over the configured ones.
        return kwargs | dict(inference_kwargs)

    def is_multimodal_pruning_enabled(self) -> bool:
        """Return True when `video_pruning_rate` is set and greater than 0."""
        return self.video_pruning_rate is not None and self.video_pruning_rate > 0

enable_mm_embeds = False class-attribute instance-attribute

If True, enables passing multimodal embeddings: for LLM class, this refers to tensor inputs under multi_modal_data; for the OpenAI-compatible server, this refers to chat messages with content "type": "*_embeds".

When enabled with --limit-mm-per-prompt set to 0 for a modality, precomputed embeddings skip count validation for that modality, saving memory by not loading encoder modules while still enabling embeddings as an input. Limits greater than 0 still apply to embeddings.

WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users!

interleave_mm_strings = False class-attribute instance-attribute

Enable fully interleaved support for multimodal prompts, while using --chat-template-content-format=string.

language_model_only = False class-attribute instance-attribute

If True, disables all multimodal inputs by setting all modality limits to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every modality.

limit_per_prompt = Field(default_factory=dict) class-attribute instance-attribute

The maximum number of input items and options allowed per prompt for each modality.

Defaults to 999 for each modality.

Legacy format (count only): {"image": 16, "video": 2}

Configurable format (with options): {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, "image": {"count": 5, "width": 512, "height": 512}}

Mixed format (combining both): {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}}

media_io_kwargs = Field(default_factory=dict) class-attribute instance-attribute

Additional args passed to process media inputs, keyed by modalities. For example, to set num_frames for video, set --media-io-kwargs '{"video": {"num_frames": 40} }'

mm_encoder_attn_backend = None class-attribute instance-attribute

Optional override for the multi-modal encoder attention backend when using vision transformers. Accepts any value from vllm.v1.attention.backends.registry.AttentionBackendEnum (e.g. FLASH_ATTN).

mm_encoder_attn_dtype = None class-attribute instance-attribute

Optional dtype override for ViT encoder attention. Set to "fp8" to enable FP8 quantization via the FlashInfer cuDNN backend. When set to "fp8" without a scale file, dynamic scaling is used automatically. See docs/features/quantization/fp8_vit_attn.md for details.

mm_encoder_fp8_scale_path = None class-attribute instance-attribute

Path to a JSON file containing per-layer FP8 Q/K/V scales for ViT encoder attention. When provided (with mm_encoder_attn_dtype="fp8"), static scaling is used. When omitted, dynamic scaling is used.

mm_encoder_fp8_scale_save_margin = Field(default=1.5, gt=0.0) class-attribute instance-attribute

Safety margin multiplied onto scales when auto-saving. A value > 1 leaves headroom so that inputs with larger activations than the calibration set do not overflow FP8 range. Default 1.5.

mm_encoder_fp8_scale_save_path = None class-attribute instance-attribute

When set with dynamic FP8 scaling (mm_encoder_attn_dtype="fp8" and no mm_encoder_fp8_scale_path), saves the calibrated scales to this file after the amax history buffer is full. The saved file can then be used as mm_encoder_fp8_scale_path in subsequent runs.

mm_encoder_only = False class-attribute instance-attribute

When enabled, skips the language component of the model.

This is usually only valid in disaggregated Encoder process.

mm_encoder_tp_mode = 'weights' class-attribute instance-attribute

Indicates how to optimize multi-modal encoder inference using tensor parallelism (TP).

  • "weights": Within the same vLLM engine, split the weights of each layer across TP ranks. (default TP behavior)
  • "data": Within the same vLLM engine, split the batched input data across TP ranks to process the data in parallel, while hosting the full weights on each TP rank. This batch-level DP is not to be confused with API request-level DP (which is controlled by --data-parallel-size). This is only supported on a per-model basis and falls back to "weights" if the encoder does not support DP.

mm_ipc_gpu_memory_gb = Field(default=0, ge=0) class-attribute instance-attribute

Amount of GPU memory (in GiB) sequestered on the engine's device for GPU-side multimodal work in the API-server (frontend) process, such as hardware video decoding.

This budget is carved out of the engine's KV-cache memory so the headroom physically exists, and frontend GPU decode paths acquire from a blocking byte-counting semaphore of this size before allocating on the device.

Set to 0 (default) to disable frontend GPU multimodal memory gating.

mm_processor_cache_gb = Field(default=4, ge=0) class-attribute instance-attribute

The size (in GiB) of the multi-modal processor cache, which is used to avoid re-processing past multi-modal inputs.

This cache is duplicated for each API process and engine core process, resulting in a total memory usage of mm_processor_cache_gb * (api_server_count + data_parallel_size).

Set to 0 to disable this cache completely (not recommended).

mm_processor_cache_type = 'lru' class-attribute instance-attribute

Type of cache to use for the multi-modal preprocessor/mapper. If shm, use shared memory FIFO cache. If lru, use mirrored LRU cache.

mm_processor_kwargs = None class-attribute instance-attribute

Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained from transformers.AutoProcessor.from_pretrained.

The available overrides depend on the model that is being run.

For example, for Phi-3-Vision: {"num_crops": 4}.

mm_shm_cache_max_object_size_mb = Field(default=128, ge=0) class-attribute instance-attribute

Size limit (in MiB) for each object stored in the multi-modal processor shared memory cache. Only effective when mm_processor_cache_type is "shm".

mm_tensor_ipc = 'direct_rpc' class-attribute instance-attribute

IPC (inter-process communication) method for multimodal tensors.

  • "direct_rpc": Use msgspec serialization via RPC
  • "torch_shm": Use torch.multiprocessing shared memory for zero-copy IPC

Defaults to "direct_rpc".

skip_mm_profiling = False class-attribute instance-attribute

When enabled, skips multimodal memory profiling and only profiles with language backbone model during engine initialization.

This reduces engine startup time but shifts the responsibility to users for estimating the peak memory usage of the activation of multimodal encoder and embedding cache.

video_pruning_rate = Field(default=None, ge=0.0, lt=1.0) class-attribute instance-attribute

Sets pruning rate for video pruning via Efficient Video Sampling. Value sits in range [0;1) and determines fraction of media tokens from each video to be pruned.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/multimodal.py
def compute_hash(self) -> str:
    """
    Hash the settings of this config that affect the structure of the
    computation graph from input ids/embeddings to the final hidden states.
    Anything before input ids/embeddings or after the final hidden states
    is excluded.

    WARNING: Whenever a new field is added to this config, make sure it is
    added to the factors list below if it affects the computation graph.

    Returns:
        The hex digest of the hash over the collected factors.
    """
    backend = self.mm_encoder_attn_backend
    # The backend is represented by its member name, or None when unset.
    backend_name = None if backend is None else backend.name

    factors: list[Any] = [
        backend_name,
        self.mm_encoder_tp_mode,
        self.mm_encoder_attn_dtype,
        self.mm_encoder_fp8_scale_path,
    ]
    encoded_factors = str(factors).encode()
    return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

get_limit_per_prompt(modality)

Get the maximum number of input items allowed per prompt for the given modality (backward compatible).

Source code in vllm/config/multimodal.py
def get_limit_per_prompt(self, modality: str) -> int:
    """
    Return the maximum number of input items allowed per prompt for
    `modality` (backward compatible).

    The result is 0 when `language_model_only` is set, 999 when the
    modality has no entry in `limit_per_prompt`, and the entry's `count`
    otherwise.
    """
    if self.language_model_only:
        return 0

    options = self.limit_per_prompt.get(modality)
    # A modality without an explicit entry defaults to 999.
    return 999 if options is None else options.count

merge_mm_processor_kwargs(inference_kwargs)

Get the keyword arguments to pass to the multi-modal processor according to the extra arguments passed during inference.

Source code in vllm/config/multimodal.py
def merge_mm_processor_kwargs(
    self,
    inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
    """
    Build the keyword arguments for the multi-modal processor by layering
    the arguments passed during inference on top of the configured
    `mm_processor_kwargs`.

    Returns:
        A new dictionary; on key collisions the inference-time value wins.
    """
    merged = dict(self.mm_processor_kwargs or {})
    merged.update(inference_kwargs)
    return merged

ObservabilityConfig

Configuration for observability - metrics and tracing.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config,

Attributes:

Source code in vllm/config/observability.py
@config
class ObservabilityConfig:
    """Configuration for observability - metrics and tracing."""

    show_hidden_metrics_for_version: str | None = None
    """Enable deprecated Prometheus metrics that have been hidden since the
    specified version. For example, if a previously deprecated metric has been
    hidden since the v0.7.0 release, you use
    `--show-hidden-metrics-for-version=0.7` as a temporary escape hatch while
    you migrate to new metrics. The metric is likely to be removed completely
    in an upcoming release."""

    @cached_property
    def show_hidden_metrics(self) -> bool:
        """Check if the hidden metrics should be shown.

        Returns False when `show_hidden_metrics_for_version` is unset;
        otherwise the decision is delegated to
        `version._prev_minor_version_was`.
        """
        if self.show_hidden_metrics_for_version is None:
            return False
        return version._prev_minor_version_was(self.show_hidden_metrics_for_version)

    otlp_traces_endpoint: str | None = None
    """Target URL to which OpenTelemetry traces will be sent."""

    collect_detailed_traces: list[DetailedTraceModules] | None = None
    """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
    set, it will collect detailed traces for the specified modules. This
    involves use of possibly costly and or blocking operations and hence might
    have a performance impact.

    Note that collecting detailed timing information for each request can be
    expensive."""

    kv_cache_metrics: bool = False
    """Enable KV cache residency metrics (lifetime, idle time, reuse gaps).
    Uses sampling to minimize overhead.
    Requires log stats to be enabled (i.e., --disable-log-stats not set)."""

    kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
    """Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""

    cudagraph_metrics: bool = False
    """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
    dispatch modes, and their observed frequencies at every logging interval)."""

    enable_layerwise_nvtx_tracing: bool = False
    """Enable layerwise NVTX tracing. This traces the execution of each layer or
    module in the model and attach information such as input/output shapes to
    nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""

    enable_mfu_metrics: bool = False
    """Enable Model FLOPs Utilization (MFU) metrics."""

    enable_mm_processor_stats: bool = False
    """Enable collection of timing statistics for multimodal processor operations.
    This is for internal use only (e.g., benchmarks) and is not exposed as a CLI
    argument."""

    enable_logging_iteration_details: bool = False
    """Enable detailed logging of iteration details.
    If set, vllm EngineCore will log iteration details
    This includes number of context/generation requests and tokens
    and the elapsed cpu time for the iteration."""

    jit_monitor_mode: Literal["warn", "error"] = "warn"
    """How to handle post-warmup JIT compilation events."""

    jit_monitor_verbose: bool = False
    """Log every monitored JIT compile with runtime details. This can emit many
    logs and add overhead, so it is intended for debugging."""

    @cached_property
    def collect_model_forward_time(self) -> bool:
        """Whether to collect model forward time for the request.

        True when `collect_detailed_traces` is set and contains "model"
        or "all".
        """
        return self.collect_detailed_traces is not None and (
            "model" in self.collect_detailed_traces
            or "all" in self.collect_detailed_traces
        )

    @cached_property
    def collect_model_execute_time(self) -> bool:
        """Whether to collect model execute time for the request.

        True when `collect_detailed_traces` is set and contains "worker"
        or "all".
        """
        return self.collect_detailed_traces is not None and (
            "worker" in self.collect_detailed_traces
            or "all" in self.collect_detailed_traces
        )

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        # The digest is taken over the string form of the (empty) factor list.
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("show_hidden_metrics_for_version")
    @classmethod
    def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
        """Check that a provided version string can be parsed.

        The value is returned unchanged; `None` skips the check. Rejection
        relies on `parse` raising for an invalid version string.
        """
        if value is not None:
            # Raises an exception if the string is not a valid version.
            parse(value)
        return value

    @field_validator("otlp_traces_endpoint")
    @classmethod
    def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
        """Require tracing support whenever an endpoint is configured.

        Returns the value unchanged. Raises `ValueError` when an endpoint is
        given but `is_tracing_available()` reports that tracing is not
        available.
        """
        if value is not None:
            # Imported only when an endpoint is actually supplied.
            # NOTE(review): presumably deferred to avoid import cost or
            # cycles when tracing is unused - confirm.
            from vllm.tracing import is_tracing_available, otel_import_error_traceback

            if not is_tracing_available():
                raise ValueError(
                    "OpenTelemetry is not available. Unable to configure "
                    "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                    f"installed. Original error:\n{otel_import_error_traceback}"
                )
        return value

    @field_validator("collect_detailed_traces")
    @classmethod
    def _validate_collect_detailed_traces(
        cls, value: list[DetailedTraceModules] | None
    ) -> list[DetailedTraceModules] | None:
        """Handle the legacy case where users might provide a comma-separated
        string instead of a list of strings."""
        # Only a single-element list whose element contains a comma is split;
        # every other value is returned as given.
        if value is not None and len(value) == 1 and "," in value[0]:
            value = cast(list[DetailedTraceModules], value[0].split(","))
        return value

    @model_validator(mode="after")
    def _validate_tracing_config(self):
        """Reject detailed-trace collection without a traces endpoint.

        Raises `ValueError` when `collect_detailed_traces` is non-empty while
        `otlp_traces_endpoint` is unset or empty. Returns `self` otherwise.
        """
        if self.collect_detailed_traces and not self.otlp_traces_endpoint:
            raise ValueError(
                "collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
            )
        return self

collect_detailed_traces = None class-attribute instance-attribute

It makes sense to set this only if --otlp-traces-endpoint is set. If set, it will collect detailed traces for the specified modules. This involves the use of possibly costly and/or blocking operations and hence might have a performance impact.

Note that collecting detailed timing information for each request can be expensive.

collect_model_execute_time cached property

Whether to collect model execute time for the request.

collect_model_forward_time cached property

Whether to collect model forward time for the request.

cudagraph_metrics = False class-attribute instance-attribute

Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph dispatch modes, and their observed frequencies at every logging interval).

enable_layerwise_nvtx_tracing = False class-attribute instance-attribute

Enable layerwise NVTX tracing. This traces the execution of each layer or module in the model and attaches information such as input/output shapes to NVTX range markers. Note that this doesn't work with CUDA graphs enabled.

enable_logging_iteration_details = False class-attribute instance-attribute

Enable detailed logging of iteration details. If set, the vLLM EngineCore will log iteration details. This includes the number of context/generation requests and tokens, and the elapsed CPU time for the iteration.

enable_mfu_metrics = False class-attribute instance-attribute

Enable Model FLOPs Utilization (MFU) metrics.

enable_mm_processor_stats = False class-attribute instance-attribute

Enable collection of timing statistics for multimodal processor operations. This is for internal use only (e.g., benchmarks) and is not exposed as a CLI argument.

jit_monitor_mode = 'warn' class-attribute instance-attribute

How to handle post-warmup JIT compilation events.

jit_monitor_verbose = False class-attribute instance-attribute

Log every monitored JIT compile with runtime details. This can emit many logs and add overhead, so it is intended for debugging.

kv_cache_metrics = False class-attribute instance-attribute

Enable KV cache residency metrics (lifetime, idle time, reuse gaps). Uses sampling to minimize overhead. Requires log stats to be enabled (i.e., --disable-log-stats not set).

kv_cache_metrics_sample = Field(default=0.01, gt=0, le=1) class-attribute instance-attribute

Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks.

otlp_traces_endpoint = None class-attribute instance-attribute

Target URL to which OpenTelemetry traces will be sent.

show_hidden_metrics cached property

Check if the hidden metrics should be shown.

show_hidden_metrics_for_version = None class-attribute instance-attribute

Enable deprecated Prometheus metrics that have been hidden since the specified version. For example, if a previously deprecated metric has been hidden since the v0.7.0 release, you use --show-hidden-metrics-for-version=0.7 as a temporary escape hatch while you migrate to new metrics. The metric is likely to be removed completely in an upcoming release.

_validate_collect_detailed_traces(value) classmethod

Handle the legacy case where users might provide a comma-separated string instead of a list of strings.

Source code in vllm/config/observability.py
@field_validator("collect_detailed_traces")
@classmethod
def _validate_collect_detailed_traces(
    cls, value: list[DetailedTraceModules] | None
) -> list[DetailedTraceModules] | None:
    """Expand the legacy single comma-separated string form into a list.

    A one-element list whose element contains a comma is split on commas;
    any other value (including ``None``) is returned as given.
    """
    if value is None or len(value) != 1:
        return value

    sole_entry = value[0]
    if "," not in sole_entry:
        return value

    return cast(list[DetailedTraceModules], sole_entry.split(","))

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/observability.py
def compute_hash(self) -> str:
    """
    Return the hash of the settings in this config that shape the
    computation graph between the input ids/embeddings and the final
    hidden states (anything before or after that span is excluded).

    WARNING: when adding a new field to this config, include it in the
    factors list if it affects the computation graph.
    """
    # Observability settings do not affect the computation graph, so the
    # factor list is deliberately empty.
    graph_factors: list[Any] = []
    encoded_factors = str(graph_factors).encode()
    digest = safe_hash(encoded_factors, usedforsecurity=False)
    return digest.hexdigest()

OffloadConfig

Configuration for model weight offloading to reduce GPU memory usage.

Methods:

Attributes:

Source code in vllm/config/offload.py
@config
class OffloadConfig:
    """Configuration for model weight offloading to reduce GPU memory usage."""

    offload_backend: OffloadBackend = "auto"
    """The backend for weight offloading. Options:
    - "auto": Selects based on which sub-config has non-default values
      (prefetch if offload_group_size > 0, uva if cpu_offload_gb > 0).
    - "uva": UVA (Unified Virtual Addressing) zero-copy offloading.
    - "prefetch": Async prefetch with group-based layer offloading.
    """

    uva: UVAOffloadConfig = Field(default_factory=UVAOffloadConfig)
    """Parameters for UVA offloading backend."""

    prefetch: PrefetchOffloadConfig = Field(default_factory=PrefetchOffloadConfig)
    """Parameters for prefetch offloading backend."""

    @model_validator(mode="after")
    def validate_offload_config(self) -> "OffloadConfig":
        """Check the prefetch constraints and warn about ignored settings.

        Raises `ValueError` when prefetch offloading is in play (backend is
        "prefetch" or `offload_group_size > 0`) and either
        `offload_num_in_group` exceeds `offload_group_size` or
        `offload_prefetch_step` is below 1. Emits a warning when fields of
        a backend that will not be used carry non-default values.
        """
        backend = self.offload_backend
        prefetch_active = self.prefetch.offload_group_size > 0

        if backend == "prefetch" or prefetch_active:
            if self.prefetch.offload_num_in_group > self.prefetch.offload_group_size:
                raise ValueError(
                    f"offload_num_in_group ({self.prefetch.offload_num_in_group})"
                    f" must be <= offload_group_size"
                    f" ({self.prefetch.offload_group_size})"
                )
            if self.prefetch.offload_prefetch_step < 1:
                raise ValueError(
                    f"offload_prefetch_step"
                    f" ({self.prefetch.offload_prefetch_step})"
                    f" must be >= 1 when prefetch offloading is enabled"
                    f" (offload_group_size > 0)"
                )

        uva_active = self.uva.cpu_offload_gb > 0

        # Pick at most one warning, depending on which backend is selected
        # and which sub-configs carry non-default values.
        warning_text = None
        if backend == "uva":
            if prefetch_active:
                warning_text = (
                    "Prefetch offload fields are set but offload_backend='uva'. "
                    "Prefetch settings will be ignored."
                )
        elif backend == "prefetch":
            if uva_active:
                warning_text = (
                    "UVA offload fields are set but offload_backend='prefetch'. "
                    "UVA settings will be ignored."
                )
        elif backend == "auto" and uva_active and prefetch_active:
            warning_text = (
                "Both UVA and prefetch offload fields are set with "
                "offload_backend='auto'. Prefetch backend will be selected. "
                "Set offload_backend explicitly to suppress this warning."
            )

        if warning_text is not None:
            warnings.warn(warning_text, stacklevel=2)
        return self

    def compute_hash(self) -> str:
        """
        Return a hash that uniquely identifies all the offload configs.

        No field is excluded: PrefetchOffloader patches module forwards and
        inserts custom ops (wait_prefetch, start_prefetch) into the
        computation graph, and any offload setting can change which layers
        are hooked and how prefetch indices are computed. The compilation
        cache therefore has to tell such configurations apart.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        # An empty ignore set means every field contributes to the hash.
        return hash_factors(get_hash_factors(self, ignored_factors=set()))

offload_backend = 'auto' class-attribute instance-attribute

The backend for weight offloading. Options: - "auto": Selects based on which sub-config has non-default values (prefetch if offload_group_size > 0, uva if cpu_offload_gb > 0). - "uva": UVA (Unified Virtual Addressing) zero-copy offloading. - "prefetch": Async prefetch with group-based layer offloading.

prefetch = Field(default_factory=PrefetchOffloadConfig) class-attribute instance-attribute

Parameters for prefetch offloading backend.

uva = Field(default_factory=UVAOffloadConfig) class-attribute instance-attribute

Parameters for UVA offloading backend.

compute_hash()

Provide a hash that uniquely identifies all the offload configs.

All fields are included because PrefetchOffloader patches module forwards and inserts custom ops (wait_prefetch, start_prefetch) into the computation graph. Changing any offload setting can alter which layers are hooked and how prefetch indices are computed, so the compilation cache must distinguish them.

Source code in vllm/config/offload.py
def compute_hash(self) -> str:
    """
    Return a hash that uniquely identifies all the offload configs.

    No field is excluded: PrefetchOffloader patches module forwards and
    inserts custom ops (wait_prefetch, start_prefetch) into the
    computation graph, and any offload setting can change which layers
    are hooked and how prefetch indices are computed. The compilation
    cache therefore has to tell such configurations apart.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # An empty ignore set means every field contributes to the hash.
    return hash_factors(get_hash_factors(self, ignored_factors=set()))

validate_offload_config()

Validate offload configuration constraints.

Source code in vllm/config/offload.py
@model_validator(mode="after")
def validate_offload_config(self) -> "OffloadConfig":
    """Check the prefetch constraints and warn about ignored settings.

    Raises `ValueError` when prefetch offloading is in play (backend is
    "prefetch" or `offload_group_size > 0`) and either
    `offload_num_in_group` exceeds `offload_group_size` or
    `offload_prefetch_step` is below 1. Emits a warning when fields of a
    backend that will not be used carry non-default values.
    """
    backend = self.offload_backend
    prefetch_active = self.prefetch.offload_group_size > 0

    if backend == "prefetch" or prefetch_active:
        if self.prefetch.offload_num_in_group > self.prefetch.offload_group_size:
            raise ValueError(
                f"offload_num_in_group ({self.prefetch.offload_num_in_group})"
                f" must be <= offload_group_size"
                f" ({self.prefetch.offload_group_size})"
            )
        if self.prefetch.offload_prefetch_step < 1:
            raise ValueError(
                f"offload_prefetch_step"
                f" ({self.prefetch.offload_prefetch_step})"
                f" must be >= 1 when prefetch offloading is enabled"
                f" (offload_group_size > 0)"
            )

    uva_active = self.uva.cpu_offload_gb > 0

    # Pick at most one warning, depending on which backend is selected and
    # which sub-configs carry non-default values.
    warning_text = None
    if backend == "uva":
        if prefetch_active:
            warning_text = (
                "Prefetch offload fields are set but offload_backend='uva'. "
                "Prefetch settings will be ignored."
            )
    elif backend == "prefetch":
        if uva_active:
            warning_text = (
                "UVA offload fields are set but offload_backend='prefetch'. "
                "UVA settings will be ignored."
            )
    elif backend == "auto" and uva_active and prefetch_active:
        warning_text = (
            "Both UVA and prefetch offload fields are set with "
            "offload_backend='auto'. Prefetch backend will be selected. "
            "Set offload_backend explicitly to suppress this warning."
        )

    if warning_text is not None:
        warnings.warn(warning_text, stacklevel=2)
    return self

ParallelConfig

Configuration for the distributed execution.

Methods:

Attributes:

Source code in vllm/config/parallel.py
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
@config
class ParallelConfig:
    """Configuration for the distributed execution."""

    pipeline_parallel_size: int = Field(default=1, ge=1)
    """Number of pipeline parallel groups."""
    tensor_parallel_size: int = Field(default=1, ge=1)
    """Number of tensor parallel groups."""
    prefill_context_parallel_size: int = Field(default=1, ge=1)
    """Number of prefill context parallel groups."""
    data_parallel_size: int = Field(default=1, ge=1)
    """Number of data parallel groups. MoE layers will be sharded according to
    the product of the tensor parallel size and data parallel size."""
    data_parallel_size_local: int = Field(default=1, ge=0)
    """Number of local data parallel groups. A value of 0 is a sentinel used by
    the engine-args layer to signal that data parallelism was specified
    externally (see `ParallelConfig.__post_init__`)."""
    data_parallel_rank: int = Field(default=0, ge=0)
    """Rank of the data parallel group. The runtime check at
    ``__post_init__`` further bounds this by ``data_parallel_size``."""
    data_parallel_rank_local: int | None = None
    """Local rank of the data parallel group, set only in SPMD mode."""
    data_parallel_master_ip: str = "127.0.0.1"
    """IP of the data parallel master."""
    data_parallel_rpc_port: int = 29550
    """Port for data parallel messaging."""
    data_parallel_master_port: int = 29500
    """Port of the data parallel master."""
    data_parallel_backend: DataParallelBackend = "mp"
    """Backend to use for data parallel, either "mp" or "ray"."""
    data_parallel_external_lb: bool = False
    """Whether to use "external" DP LB mode. Applies only to online serving
    and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
    wide-EP setup in Kubernetes. Supported only for MoE deployments; non-MoE
    models should use independent vLLM instances without --data-parallel-*
    arguments. Set implicitly when --data-parallel-rank is provided explicitly
    to vllm serve."""
    data_parallel_hybrid_lb: bool = False
    """Whether to use "hybrid" DP LB mode. Applies only to online serving
    and when data_parallel_size > 0. Enables running an AsyncLLM
    and API server on a "per-node" basis where vLLM load balances
    between local data parallel ranks, but an external LB balances
    between vLLM nodes/replicas. Set explicitly in conjunction with
    --data-parallel-start-rank."""
    is_moe_model: bool | None = None
    """Whether the deployed model is MoE (if known)."""
    enable_expert_parallel: bool = False
    """Use expert parallelism instead of tensor parallelism for MoE layers."""
    enable_ep_weight_filter: bool = False
    """Skip non-local expert weights during model loading when expert
    parallelism is active.  Each rank only reads its own expert shard from
    disk, which can drastically reduce storage I/O for MoE models with
    per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5).  Has no
    effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
    models."""
    enable_eplb: bool = False
    """Enable expert parallelism load balancing for MoE layers."""
    eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
    """Expert parallelism configuration."""
    expert_placement_strategy: ExpertPlacementStrategy = "linear"
    """The expert placement strategy for MoE layers:

    - "linear": Experts are placed in a contiguous manner. For example, with 4
      experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
      experts [2, 3].
    - "round_robin": Experts are placed in a round-robin manner. For example,
      with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
      will have experts [1, 3]. This strategy can help improve load balancing
      for grouped expert models with no redundant experts."""
    all2all_backend: All2AllBackend = "allgather_reducescatter"
    """All2All backend for MoE expert parallel communication. Available options:

    - "allgather_reducescatter": All2all based on allgather and reducescatter
    - "deepep_high_throughput": Use deepep high-throughput kernels
    - "deepep_low_latency": Use deepep low-latency kernels
    - "mori_high_throughput": MoRI EP with InterNodeV1 for multi-node
    - "mori_low_latency": MoRI EP with InterNodeV1LL for multi-node
    - "nixl_ep": Use nixl-ep kernels
    - "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
    - "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""

    max_parallel_loading_workers: int | None = Field(default=None, ge=1)
    """Maximum number of parallel loading workers when loading model
    sequentially in multiple batches. To avoid RAM OOM when using tensor
    parallel and large models."""

    disable_custom_all_reduce: bool = False
    """Disable the custom all-reduce kernel and fall back to NCCL."""

    enable_elastic_ep: bool = False
    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""

    enable_dbo: bool = False
    """Enable dual batch overlap for the model executor."""
    ubatch_size: int = Field(default=0, ge=0)
    """Number of ubatch size."""

    dbo_decode_token_threshold: int = Field(default=32, ge=0)
    """The threshold for dual batch overlap for batches only containing decodes.
    If the number of tokens in the request is greater than this threshold,
    microbatching will be used. Otherwise, the request will be processed in a
    single batch."""
    dbo_prefill_token_threshold: int = Field(default=512, ge=0)  # TODO(lucas): tune
    """The threshold for dual batch overlap for batches that contain one or more
    prefills. If the number of tokens in the request is greater than this
    threshold, microbatching will be used. Otherwise, the request will be
    processed in a single batch."""

    disable_nccl_for_dp_synchronization: bool | None = None
    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py 
    to use Gloo instead of NCCL for its all reduce.

    Defaults to True when async scheduling is enabled, False otherwise.
    """

    ray_workers_use_nsight: bool = False
    """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""

    ray_runtime_env: RuntimeEnv | None = None
    """Ray runtime environment to pass to distributed workers."""

    placement_group: PlacementGroup | None = None
    """ray distributed model workers placement group."""

    distributed_executor_backend: (
        str | DistributedExecutorBackend | type[Executor] | None
    ) = None
    """
    Backend to use for distributed model workers, either "ray" or "mp"
    (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size
    is less than or equal to the number of GPUs available, "mp" will be used to
    keep processing on a single host. Otherwise, an error will be raised. To use "mp"
    you must also set nnodes, and to use "ray" you must manually set
    distributed_executor_backend to "ray".

    Note:
        [TPU](https://docs.vllm.ai/projects/tpu/en/latest/) platform only supports Ray
        for distributed inference.
    """

    worker_cls: str = "auto"
    """The full name of the worker class to use. If "auto", the worker class
    will be determined based on the platform."""
    sd_worker_cls: str = "auto"
    """The full name of the worker class to use for speculative decoding.
    If "auto", the worker class will be determined based on the platform."""
    worker_extension_cls: str = ""
    """The full name of the worker extension class to use. The worker extension
    class is dynamically inherited by the worker class. This is used to inject
    new attributes and methods to the worker class for use in collective_rpc
    calls."""
    master_addr: str = "127.0.0.1"
    """distributed master address for multi-node distributed 
    inference when distributed_executor_backend is mp."""
    master_port: int = 29501
    """distributed master port for multi-node distributed 
    inference when distributed_executor_backend is mp."""
    node_rank: int = Field(default=0, ge=0)
    """distributed node rank for multi-node distributed
    inference when distributed_executor_backend is mp."""
    nnodes: int = Field(default=1, ge=1)
    """num of nodes for multi-node distributed
    inference when distributed_executor_backend is mp."""
    numa_bind: bool = False
    """Enable NUMA binding for GPU worker subprocesses.

    By default, workers are pinned to their GPU's NUMA-local CPUs and
    memory; on PCT-capable Xeons they also auto-bind to the SKU's
    PCT priority cores.
    """
    numa_bind_nodes: list[int] | None = None
    """NUMA node to bind each GPU worker to.

    Specify one NUMA node per visible GPU, for example `[0, 0, 1, 1]`
    for a 4-GPU system with GPUs 0-1 on NUMA node 0 and GPUs 2-3 on
    NUMA node 1. If unset and `numa_bind=True`, vLLM auto-detects the
    GPU-to-NUMA topology. The values are passed to `numactl --membind`
    and `--cpunodebind`, so they must be valid `numactl` NUMA node indices.
    """
    numa_bind_cpus: list[str] | None = None
    """Optional CPU lists to bind each GPU worker to.

    Specify one CPU list per visible GPU, for example
    `["0-3", "4-7", "8-11", "12-15"]`. When set, vLLM uses
    `numactl --physcpubind` instead of `--cpunodebind`. This is useful
    for custom policies such as binding to PCT or other high-frequency cores.
    Each entry must use `numactl --physcpubind` CPU-list syntax, for example
    `"0-3"` or `"0,2,4-7"`.
    """
    assigned_physical_gpu_ids: list[int] | None = None
    """Mapping from vLLM-local logical GPU IDs to physical GPU IDs.

    For example, ``[2, 3]`` means logical GPU 0 maps to physical GPU 2,
    and logical GPU 1 maps to physical GPU 3. Physical IDs are used only
    at platform/topology boundaries such as NVML, NIC affinity, P2P
    checks, and final CUDA device selection when needed. When None,
    logical IDs map to visible device IDs in order."""

    distributed_timeout_seconds: int | None = None
    """Timeout in seconds for distributed operations (e.g., init_process_group).
    If set, this value is passed to torch.distributed.init_process_group as the
    timeout parameter. If None, PyTorch's default timeout is used (600s for NCCL).
    Increase this for multi-node setups where model downloads may be slow."""

    cpu_distributed_timeout_seconds: int | None = None
    """Timeout (in seconds) for cpu communication groups. If None, PyTorch's
    default timeout is used (1800s for gloo)."""

    world_size: int = Field(init=False)
    """world_size is TPxPP, it affects the number of workers we create."""

    rank: int = 0
    """Global rank in distributed setup."""

    _data_parallel_master_port_list: list[int] = Field(default_factory=list)
    """List of open port auto-queried for data parallel messaging.
    Set to be private as it's not intended to be configured by users.
    """

    _coord_store_port: int = 0
    """Port of the coordination TCPStore. Can be set by the API server; workers
    connect as clients to exchange self-picked group ports at runtime."""

    decode_context_parallel_size: int = Field(default=1, ge=1)
    """Number of decode context parallel groups, because the world size does
    not change by dcp, it simply reuse the GPUs of TP group, and tp_size
    needs to be divisible by dcp_size."""

    dcp_kv_cache_interleave_size: int = 1
    """
    Interleave size of kv_cache storage while using DCP.
    dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
    and will be deprecated when PCP is fully supported.

    """
    dcp_comm_backend: DCPCommBackend = "ag_rs"
    """Communication backend for Decode Context Parallel (DCP).
    - "ag_rs": AllGather + ReduceScatter (default, existing behavior)
    - "a2a": All-to-All exchange of partial outputs + LSE, then
      combine with Triton kernel. Reduces NCCL calls from 3 to 2
      per layer for MLA models.
    """

    cp_kv_cache_interleave_size: int = 1
    """Interleave size of kv_cache storage while using DCP or PCP.
    For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
        and `total_cp_world_size = pcp_world_size * dcp_world_size`.
    store interleave_size tokens on total_cp_rank i,
    then store next interleave_size tokens on total_cp_rank i+1.
    Interleave_size=1: token-level alignment, where token `i` is stored on
        total_cp_rank `i % total_cp_world_size`.
    Interleave_size=block_size: block-level alignment, where tokens are
        first populated to the preceding ranks. Tokens are then stored
        in (rank i+1, block j) only after (rank i, block j) is fully occupied.
    Block_size should be greater than or equal to cp_kv_cache_interleave_size.
    Block_size should be divisible by cp_kv_cache_interleave_size.
    """

    data_parallel_index: int = Field(init=False)
    """Equal to the data parallel rank but not used for torch process groups
    and not overridden for dense models."""

    _api_process_count: int = Field(default=1, gt=0)
    """
    The number of API processes initialized.

    Note:
        This is an internal config that is only valid for and
        should only be set by API server scale-out.
    """

    _api_process_rank: int = Field(default=0, ge=-1)
    """
    The rank of this API process, or `-1` for engine core processes
    under API server scale-out.

    Note:
        This is an internal config that is only valid for and
        should only be set by API server scale-out.
    """

    @field_validator("disable_nccl_for_dp_synchronization", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Let a `None` value bypass the wrapped validator.

        `None` marks a field whose initialisation is delayed, so it is returned
        untouched; every other value is passed on to `handler`.
        """
        if value is None:
            return None
        return handler(value)

    @field_validator("numa_bind_nodes")
    @classmethod
    def _validate_numa_bind_nodes(cls, value: list[int] | None) -> list[int] | None:
        """Check an explicit NUMA node list; `None` (unset) is returned as-is.

        Raises:
            ValueError: If the list is empty or contains a negative index.
        """
        if value is not None:
            if not value:
                raise ValueError("numa_bind_nodes must not be empty.")
            # The list is non-empty here, so its minimum is well defined.
            if min(value) < 0:
                raise ValueError("numa_bind_nodes must contain non-negative integers.")
        return value

    @field_validator("numa_bind_cpus")
    @classmethod
    def _validate_numa_bind_cpus(cls, value: list[str] | None) -> list[str] | None:
        """Check explicit per-GPU CPU lists; `None` (unset) is returned as-is.

        Every entry must be non-empty, must fully match
        `_NUMACTL_CPUSET_PATTERN`, and no `start-end` range may have a start
        greater than its end.

        Raises:
            ValueError: If the list or one of its entries is empty, an entry
                does not match the pattern, or a range is descending.
        """
        if value is None:
            return None
        if not value:
            raise ValueError("numa_bind_cpus must not be empty.")

        for entry in value:
            if not entry:
                raise ValueError("numa_bind_cpus entries must not be empty.")
            if _NUMACTL_CPUSET_PATTERN.fullmatch(entry) is None:
                raise ValueError(
                    "numa_bind_cpus entries must use numactl CPU list syntax, "
                    "for example '0-3' or '0,2,4-7'."
                )
            for piece in entry.split(","):
                # Single CPU ids have no dash and need no ordering check.
                first, dash, last = piece.partition("-")
                if dash and int(first) > int(last):
                    raise ValueError(
                        f"numa_bind_cpus ranges must be ascending, but got '{entry}'."
                    )
        return value

    @model_validator(mode="after")
    def _validate_parallel_config(self) -> Self:
        """Validate combinations of fields after per-field validation.

        The checks run in this order and the first failure raises:

        - `_api_process_rank` must be below `_api_process_count`.
        - The removed "pplx" / "naive" all2all backends are replaced by
          "allgather_reducescatter" with a warning (no error).
        - `data_parallel_size_local` must not exceed `data_parallel_size`.
        - `data_parallel_external_lb` requires `data_parallel_size > 1`.
        - `numa_bind_nodes` / `numa_bind_cpus` require `numa_bind=True`.
        - EPLB requires a CUDA-alike platform, expert parallelism and
          TP * DP > 1; without EPLB, `num_redundant_experts` must be 0.
        - `tensor_parallel_size` must be divisible by
          `decode_context_parallel_size`.
        - The "a2a" DCP backend requires `decode_context_parallel_size > 1`.

        Returns:
            This instance (with `all2all_backend` possibly rewritten).

        Raises:
            ValueError: If any of the checks above fails.
        """
        if self._api_process_rank >= self._api_process_count:
            raise ValueError(
                "Invalid value of `_api_process_rank`. "
                f"Expected to be `-1` or `[0, {self._api_process_count})`, "
                f"but found: {self._api_process_rank}"
            )

        # Removed backends are tolerated: warn and fall back instead of failing.
        if self.all2all_backend in ["pplx", "naive"]:
            logger.warning(
                "The '%s' all2all backend has been removed. "
                "Falling back to 'allgather_reducescatter'.",
                self.all2all_backend,
            )
            self.all2all_backend = "allgather_reducescatter"

        if self.data_parallel_size_local > self.data_parallel_size:
            raise ValueError(
                f"data_parallel_size_local ({self.data_parallel_size_local}) "
                f"must be <= data_parallel_size ({self.data_parallel_size})"
            )

        if self.data_parallel_size <= 1 and self.data_parallel_external_lb:
            raise ValueError(
                "data_parallel_external_lb can only be set when data_parallel_size > 1"
            )

        if not self.numa_bind and (
            self.numa_bind_nodes is not None or self.numa_bind_cpus is not None
        ):
            raise ValueError(
                "numa_bind_nodes and numa_bind_cpus require numa_bind=True."
            )

        if self.enable_eplb:
            if not current_platform.is_cuda_alike():
                raise ValueError(
                    "Expert parallelism load balancing is only supported on "
                    "CUDA devices or ROCm devices now."
                )
            if not self.enable_expert_parallel:
                raise ValueError("enable_expert_parallel must be True to use EPLB.")
            if self.tensor_parallel_size * self.data_parallel_size <= 1:
                raise ValueError(
                    "EPLB requires tensor_parallel_size or data_parallel_size "
                    f"to be greater than 1, but got "
                    f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
                )
        else:
            if self.eplb_config.num_redundant_experts != 0:
                raise ValueError(
                    "num_redundant_experts is set to "
                    f"{self.eplb_config.num_redundant_experts} but EPLB is not "
                    "enabled. Either enable EPLB or unset "
                    "num_redundant_experts."
                )

        # Note(hc): In the current implementation of decode context
        # parallel(DCP), tp_size needs to be divisible by dcp_size,
        # because the world size does not change by dcp, it simply
        # reuses the GPUs of TP group, and split one TP group into
        # tp_size//dcp_size DCP groups.
        if self.tensor_parallel_size % self.decode_context_parallel_size != 0:
            # The trailing space matters: adjacent string literals are joined
            # without any separator.
            raise ValueError(
                f"tp_size={self.tensor_parallel_size} must be divisible by "
                f"dcp_size={self.decode_context_parallel_size}."
            )

        if self.dcp_comm_backend == "a2a" and self.decode_context_parallel_size <= 1:
            raise ValueError(
                "dcp_comm_backend='a2a' requires decode_context_parallel_size > 1."
            )

        return self

    @property
    def world_size_across_dp(self) -> int:
        """Size of the world including data parallelism.

        Computed as `world_size` multiplied by `data_parallel_size`.
        """
        dp_replicas = self.data_parallel_size
        return self.world_size * dp_replicas

    @property
    def use_ubatching(self) -> bool:
        """Whether micro-batching is used: DBO enabled or `ubatch_size > 1`."""
        dbo_enabled = self.enable_dbo
        return dbo_enabled if dbo_enabled else self.ubatch_size > 1

    @property
    def num_ubatches(self) -> int:
        """Number of micro-batches: 2 when DBO is enabled, else `ubatch_size`."""
        if self.enable_dbo:
            return 2
        return self.ubatch_size

    @property
    def local_engines_only(self) -> bool:
        """Whether the client manages only local EngineCores.

        True in the external and hybrid LB cases. In the pure internal LB
        case the client manages local and remote EngineCores, so this is
        False.
        """
        external_lb = self.data_parallel_external_lb
        return external_lb if external_lb else self.data_parallel_hybrid_lb

    def get_next_dp_init_port(self) -> int:
        """Hand out a fresh port for a data-parallel process group.

        Process groups related to data parallelism may be initialized in
        several processes (e.g. both the worker and the engine). To avoid
        port conflicts, each call pops a port from the prepared port list.
        Once that list is empty, the current `data_parallel_master_port` is
        returned and the attribute is incremented for the next caller.
        """
        prepared_ports = self._data_parallel_master_port_list
        if prepared_ports:
            return prepared_ports.pop()

        port = self.data_parallel_master_port
        self.data_parallel_master_port = port + 1
        return port

    def _pick_stateless_dp_port(self) -> tuple[int, socket.socket | None]:
        """Choose the port (and optional pre-bound socket) for DP group init.

        Without a coordination store port, the next port from
        `get_next_dp_init_port` is returned and no socket is created.
        With one, DP rank 0 binds a listening socket, publishes its port in
        the store under ``dp_master_port`` and returns the socket as well;
        every other rank reads the port from the store.

        Returns:
            ``(port, listen_socket)``, where ``listen_socket`` is ``None``
            except on rank 0 in the coord-store case.
        """
        if not self._coord_store_port:
            return self.get_next_dp_init_port(), None

        from vllm.distributed.utils import get_cached_tcp_store_client

        store = get_cached_tcp_store_client(
            self.data_parallel_master_ip, self._coord_store_port
        )
        port_key = "dp_master_port"

        if self.data_parallel_rank != 0:
            # Non-zero ranks only read the port that rank 0 published.
            return int(store.get(port_key).decode()), None

        listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # Binding to port 0 lets the OS pick the port, read back below.
        listener.bind((self.data_parallel_master_ip, 0))
        listener.listen()
        chosen_port = listener.getsockname()[1]
        store.set(port_key, str(chosen_port).encode())
        return chosen_port, listener

    @overload
    def stateless_init_dp_group(
        self, return_store: Literal[False] = ...
    ) -> ProcessGroup: ...
    # Only the `False` overload carries the default: a call without arguments
    # must resolve to the plain `ProcessGroup` return type, not to both.
    @overload
    def stateless_init_dp_group(
        self, return_store: Literal[True]
    ) -> tuple[ProcessGroup, Store]: ...
    def stateless_init_dp_group(
        self, return_store: bool = False
    ) -> ProcessGroup | tuple[ProcessGroup, Store]:
        """Create the stateless gloo process group for data parallelism.

        The port (and optional listen socket) comes from
        `_pick_stateless_dp_port`. Initialization is attempted up to five
        times, picking a new port whenever it fails with a
        `DistNetworkError` whose message contains ``EADDRINUSE``.

        Args:
            return_store: Forwarded to
                `stateless_init_torch_distributed_process_group`. Per the
                overloads, `True` yields ``(group, store)`` and `False`
                yields the group alone.

        Raises:
            DistNetworkError: Immediately for any error other than
                ``EADDRINUSE``, or the last ``EADDRINUSE`` error once all
                retries are exhausted.
        """
        # NOTE: In high-concurrency scenarios multiple processes
        # can pick the same (currently free) port through a race
        # condition when calling `get_open_port()`. When the first
        # process binds the port the others will subsequently fail
        # with `torch.distributed.DistNetworkError: EADDRINUSE`.
        # To make the initialization more robust we retry a few times
        # with a fresh port whenever this specific error is observed.
        from torch.distributed import DistNetworkError

        from vllm.distributed.utils import (
            stateless_init_torch_distributed_process_group,
        )

        max_retries = 5
        last_exc: Exception | None = None
        for _ in range(max_retries):
            try:
                port, listen_socket = self._pick_stateless_dp_port()
                # use gloo since the engine process might not have cuda device
                return stateless_init_torch_distributed_process_group(
                    self.data_parallel_master_ip,
                    port,
                    self.data_parallel_rank,
                    self.data_parallel_size,
                    backend="gloo",
                    return_store=return_store,
                    listen_socket=listen_socket,
                )
            except DistNetworkError as e:
                # We only want to retry when the root cause is EADDRINUSE.
                if "EADDRINUSE" in str(e):
                    logger.warning("Address already in use. Retrying with a new port.")
                    last_exc = e
                    continue  # try again with a new port
                raise

        # If we get here all retries have failed.
        assert last_exc is not None
        raise last_exc

    # The all_reduce at the end of attention (during o_proj) means that
    # inputs are replicated across each rank of the tensor parallel group.
    # If using expert-parallelism with DeepEP All2All ops, replicated
    # tokens result in useless duplicate computation and communication.
    #
    # In this case, ensure the input to the experts is sequence parallel
    # to avoid the excess work.
    #
    @property
    def use_sequence_parallel_moe(self) -> bool:
        """Whether the input to the MoE experts should be sequence parallel.

        True only when the all2all backend is one of the listed backends,
        expert parallelism is enabled, and both the tensor parallel and data
        parallel sizes are greater than 1.
        """
        sequence_parallel_backends = (
            "allgather_reducescatter",
            "deepep_high_throughput",
            "deepep_low_latency",
            "mori_high_throughput",
            "mori_low_latency",
            "nixl_ep",
        )
        if self.all2all_backend not in sequence_parallel_backends:
            return False
        return (
            self.enable_expert_parallel
            and self.tensor_parallel_size > 1
            and self.data_parallel_size > 1
        )

    @property
    def use_batched_dp_moe(self) -> bool:
        """Whether the batched DP MoE path applies.

        True only when the all2all backend is "deepep_low_latency" or
        "nixl_ep", expert parallelism is enabled, and the data parallel size
        is greater than 1.
        """
        batched_backends = ("deepep_low_latency", "nixl_ep")
        if self.all2all_backend not in batched_backends:
            return False
        return self.enable_expert_parallel and self.data_parallel_size > 1

    @property
    def node_rank_within_dp(self) -> int:
        """`node_rank` reduced modulo `nnodes_within_dp`."""
        nodes_per_group = self.nnodes_within_dp
        return self.node_rank % nodes_per_group

    @property
    def nnodes_within_dp(self) -> int:
        """Node count after dividing `nnodes` by the DP node-group count.

        Returns 1 for a single-node setup. Otherwise `nnodes` is floor-divided
        by `data_parallel_size // data_parallel_size_local`.
        """
        if self.nnodes != 1:
            dp_node_groups = self.data_parallel_size // self.data_parallel_size_local
            return self.nnodes // dp_node_groups
        return 1

    @property
    def local_world_size(self) -> int:
        """`world_size` floor-divided by `nnodes_within_dp`."""
        nodes_per_group = self.nnodes_within_dp
        return self.world_size // nodes_per_group

    @staticmethod
    def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
        """Return whether any rank of `dp_group` reports unfinished work.

        The local flag is placed in an int32 CPU tensor and all-reduced with
        MAX, which acts as a logical OR on 0/1 values: e.g. rank 0 True and
        rank 1 False aggregate to True.
        """
        flag = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
        torch.distributed.all_reduce(flag, op=ReduceOp.MAX, group=dp_group)
        return bool(flag.item())

    @staticmethod
    def sync_dp_state(
        dp_group: ProcessGroup, has_unfinished: bool, pending_pause: bool
    ) -> tuple[bool, bool]:
        """Synchronize DP work/pause state with one SUM all-reduce.

        Each rank contributes a 2-element int32 CPU tensor:
          [0] = 1 if this rank has unfinished work, else 0.
                A sum above 0 means at least one rank has work.
          [1] = 1 if this rank has a pending pause request, else 0.
                A sum equal to the group size means every rank has reached
                pause consensus.

        The global unfinished flag is also true while only some of the ranks
        are waiting for a pause consensus.

        Returns:
            (has_unfinished_global, pause_consensus)
        """
        local_state = [int(has_unfinished), int(pending_pause)]
        state = torch.tensor(local_state, dtype=torch.int32, device="cpu")
        torch.distributed.all_reduce(state, op=ReduceOp.SUM, group=dp_group)

        dp_size = dp_group.size()
        pause_count = state[1].item()
        unfinished_count = state[0].item()
        has_unfinished_global = unfinished_count > 0 or pause_count % dp_size != 0
        pause_consensus = pause_count == dp_size
        return has_unfinished_global, pause_consensus

    @staticmethod
    def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
        """Return the minimum `kv_cache_memory` across the ranks of `dp_group`.

        A local value of -1 is replaced by the int64 maximum before the
        reduction, so it only survives the MIN when every rank passes -1 (in
        which case the int64 maximum is returned).
        """
        local_size = (
            torch.iinfo(torch.int64).max if kv_cache_memory == -1 else kv_cache_memory
        )
        size = torch.tensor([local_size], dtype=torch.int64, device="cpu")
        # we cannot use broadcast for stateless dp group since it depends
        # on global rank
        torch.distributed.all_reduce(size, op=ReduceOp.MIN, group=dp_group)
        return size.item()

    def compute_hash(self):
        """
        Hash the configs that affect the structure of the computation graph.

        The hash is meant to uniquely identify every config that shapes the
        graph from input ids/embeddings to the final hidden states, excluding
        anything before the input ids/embeddings and after the final hidden
        states. The fields named in the exclusion set below are left out.

        This hash is also used for DP worker configuration validation
        to prevent hangs from mismatched collective communication patterns.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        excluded_fields = {
            # Derived/runtime topology, networking, or launch details
            "data_parallel_rank",
            "data_parallel_rank_local",
            "data_parallel_size_local",
            "data_parallel_index",
            "data_parallel_backend",
            "data_parallel_external_lb",
            "data_parallel_hybrid_lb",
            "data_parallel_master_ip",
            "data_parallel_master_port",
            "_data_parallel_master_port_list",
            "data_parallel_rpc_port",
            "rank",
            "master_addr",
            "master_port",
            "node_rank",
            "nnodes",
            "max_parallel_loading_workers",
            "disable_custom_all_reduce",
            "ray_workers_use_nsight",
            "ray_runtime_env",
            "placement_group",
            "distributed_executor_backend",
            "worker_cls",
            "sd_worker_cls",
            "worker_extension_cls",
            "_api_process_count",
            "_api_process_rank",
            # NUMA binding is per-rank host-side memory locality; it does
            # not affect collective-communication semantics. When numa_bind
            # is enabled with auto-detection, each DP rank stores its own
            # NUMA node in numa_bind_nodes (see vllm/utils/numa_utils.py
            # `_get_numa_node`), which would otherwise diverge the DP hash.
            "numa_bind",
            "numa_bind_nodes",
            "numa_bind_cpus",
            "assigned_physical_gpu_ids",
        }

        return hash_factors(get_hash_factors(self, excluded_fields))

    def __post_init__(self) -> None:
        # Continue with the rest of the initialization
        self.world_size = (
            self.pipeline_parallel_size
            * self.tensor_parallel_size
            * self.prefill_context_parallel_size
        )

        if self.distributed_executor_backend == "external_launcher":
            logger.info("Using external launcher for distributed inference.")
            self.world_size *= self.data_parallel_size

        if self.enable_elastic_ep:
            if not self.enable_eplb:
                raise ValueError("Elastic EP is only supported with enable_eplb=True.")
            if self.pipeline_parallel_size > 1:
                raise ValueError(
                    "Elastic EP is not supported with pipeline parallelism "
                    f"(pipeline_parallel_size={self.pipeline_parallel_size})."
                )
            if self.data_parallel_external_lb or self.data_parallel_hybrid_lb:
                raise NotImplementedError(
                    "Elastic EP is not compatible with data_parallel_external_lb "
                    "or data_parallel_hybrid_lb. Elastic EP relies on a single API "
                    "server and core client to coordinate scale up/down."
                )
            if self.eplb_config.use_async:
                from vllm.distributed.nixl_utils import is_nixl_available

                if not is_nixl_available():
                    raise ValueError(
                        "Elastic EP with async EPLB requires the NIXL "
                        "package. Either install NIXL or set "
                        "--eplb-config.use_async=false."
                    )

        if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
            # Data parallel was specified in the engine args.
            if self.distributed_executor_backend == "external_launcher":
                # For external launcher,
                # we need to set the data parallel rank automatically
                self.data_parallel_rank = int(os.environ["RANK"]) // (
                    self.world_size // self.data_parallel_size
                )
                logger.info(
                    "Set data_parallel_rank to %d automatically.",
                    self.data_parallel_rank,
                )
            if not self.enable_elastic_ep:
                if not self._data_parallel_master_port_list:
                    self._data_parallel_master_port_list = get_open_ports_list(5)
                self.data_parallel_master_port = (
                    self._data_parallel_master_port_list.pop()
                )

            if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                raise ValueError(
                    f"data_parallel_rank ({self.data_parallel_rank})"
                    f" must be in the range [0, {self.data_parallel_size})"
                )
        else:
            # Otherwise fall back to env vars (e.g. for offline SPMD case).
            self.data_parallel_size = envs.VLLM_DP_SIZE
            self.data_parallel_rank = envs.VLLM_DP_RANK
            self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
            self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
            self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

            if self.data_parallel_size > 1 and self.is_moe_model is False:
                raise ValueError(
                    "Offline data parallel mode is not supported/useful"
                    " for dense models."
                )

        self.data_parallel_index = self.data_parallel_rank

        if self.distributed_executor_backend == "external_launcher":
            os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
            logger.info("Disabling V1 multiprocessing for external launcher.")

        if self.distributed_executor_backend is None and self.world_size_across_dp > 1:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.

            from vllm.v1.executor import ray_utils

            backend: DistributedExecutorBackend = "mp"
            ray_found = ray_utils.ray_is_available()
            if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                backend = "uni"
            elif current_platform.is_cuda() and self.nnodes > 1:
                backend = "mp"
            elif (
                current_platform.is_cuda()
                and current_platform.device_count() < self.world_size
            ):
                gpu_count = current_platform.device_count()
                raise ValueError(
                    f"World size ({self.world_size}) is larger than the number of "
                    f"available GPUs ({gpu_count}) in this node. If this is "
                    "intentional and you are using:\n"
                    "- ray, set '--distributed-executor-backend ray'.\n"
                    "- multiprocessing, set '--nnodes' appropriately."
                )
            elif self.data_parallel_backend == "ray":
                logger.info(
                    "Using ray distributed inference because "
                    "data_parallel_backend is ray"
                )
                backend = "ray"
            elif ray_found:
                if self.placement_group:
                    backend = "ray"
                else:
                    from ray import is_initialized as ray_is_initialized

                    if ray_is_initialized():
                        from ray.util import get_current_placement_group

                        if get_current_placement_group():
                            backend = "ray"
            self.distributed_executor_backend = backend
            logger.debug("Defaulting to use %s for distributed inference", backend)

        if self.distributed_executor_backend is None and self.world_size == 1:
            self.distributed_executor_backend = "uni"

        if self.max_parallel_loading_workers is not None:
            logger.warning(
                "max_parallel_loading_workers is currently "
                "not supported and will be ignored."
            )
        allowed_backends = ("mp", "uni", "external_launcher")
        if (
            self.distributed_executor_backend not in allowed_backends
            and self.nnodes > 1
        ):
            raise ValueError(
                "nnodes > 1 can only be set when distributed executor "
                "backend is mp, uni or external_launcher."
            )

        if self.enable_eplb and self.eplb_config.communicator is None:
            # Prefer NIXL when available: zero-copy RDMA reads, compatible
            # with both async EPLB and elastic EP (deferred remote setup).
            # Fallbacks: pynccl for elastic EP (stateless groups need it),
            # torch_gloo for static EP.  torch_nccl is avoided because NCCL
            # is incompatible with async EPLB (multi-stream conflicts) and
            # batched isend/irecv hangs under high load.
            # See https://github.com/pytorch/pytorch/issues/174288
            from vllm.distributed.nixl_utils import is_nixl_available

            if is_nixl_available():
                self.eplb_config.communicator = "nixl"
            elif self.enable_elastic_ep:
                self.eplb_config.communicator = "pynccl"
            else:
                self.eplb_config.communicator = "torch_gloo"

    @property
    def use_ray(self) -> bool:
        """Report whether the configured distributed executor backend is Ray-based.

        The result is truthy when the backend is the literal string ``"ray"``.
        When the backend is given as a class, the result is that class's
        ``uses_ray`` attribute (``False`` if the attribute is absent).
        """
        backend = self.distributed_executor_backend
        if backend == "ray":
            return True
        # A backend supplied as a class opts in through a ``uses_ray`` attribute.
        return isinstance(backend, type) and getattr(backend, "uses_ray", False)

    @model_validator(mode="after")
    def _verify_args(self) -> Self:
        """Validate the executor backend and settle custom all-reduce usage.

        Registered as an "after" model validator.

        Returns:
            This config instance; ``disable_custom_all_reduce`` may have been
            forced to ``True`` along the way.

        Raises:
            ValueError: If ``distributed_executor_backend`` is set but is
                neither a string nor an ``Executor`` subclass, or if
                ``ray_workers_use_nsight`` is set without a Ray-based backend.
        """
        # Lazy import to avoid circular import
        from vllm.v1.executor import Executor

        # Batch-invariant mode turns the custom all-reduce kernel off.
        if envs.VLLM_BATCH_INVARIANT:
            self.disable_custom_all_reduce = True

        # Only the *type* of the backend is checked here: any string passes,
        # as does any Executor subclass; everything else is rejected.
        if (
            self.distributed_executor_backend is not None
            and not isinstance(self.distributed_executor_backend, str)
            and not (
                isinstance(self.distributed_executor_backend, type)
                and issubclass(self.distributed_executor_backend, Executor)
            )
        ):
            raise ValueError(
                "Unrecognized distributed executor backend "
                f"{self.distributed_executor_backend}. Supported "
                "values are 'ray', 'mp', 'uni', 'external_launcher', "
                "custom Executor subclass or its import path."
            )
        if self.use_ray:
            from vllm.v1.executor import ray_utils

            # NOTE(review): presumably raises when Ray cannot be imported;
            # confirm against ray_utils.
            ray_utils.assert_ray_available()

        if not current_platform.use_custom_allreduce():
            self.disable_custom_all_reduce = True
            logger.debug(
                "Disabled the custom all-reduce kernel because it is not "
                "supported on current platform."
            )
        if self.nnodes > 1:
            self.disable_custom_all_reduce = True
            logger.debug(
                "Disabled the custom all-reduce since we are running on multi-node."
            )
        if self.ray_workers_use_nsight and not self.use_ray:
            raise ValueError(
                "Unable to use nsight profiling unless workers run with Ray."
            )

        return self

_api_process_count = Field(default=1, gt=0) class-attribute instance-attribute

The number of API processes initialized.

Note

This is an internal config that is only valid for and should only be set by API server scale-out.

_api_process_rank = Field(default=0, ge=(-1)) class-attribute instance-attribute

The rank of this API process, or -1 for engine core processes under API server scale-out.

Note

This is an internal config that is only valid for and should only be set by API server scale-out.

_coord_store_port = 0 class-attribute instance-attribute

Port of the coordination TCPStore. Can be set by the API server; workers connect as clients to exchange self-picked group ports at runtime.

_data_parallel_master_port_list = Field(default_factory=list) class-attribute instance-attribute

List of open ports auto-queried for data parallel messaging. Kept private because it is not intended to be configured by users.

all2all_backend = 'allgather_reducescatter' class-attribute instance-attribute

All2All backend for MoE expert parallel communication. Available options:

  • "allgather_reducescatter": All2all based on allgather and reducescatter
  • "deepep_high_throughput": Use deepep high-throughput kernels
  • "deepep_low_latency": Use deepep low-latency kernels
  • "mori_high_throughput": MoRI EP with InterNodeV1 for multi-node
  • "mori_low_latency": MoRI EP with InterNodeV1LL for multi-node
  • "nixl_ep": Use nixl-ep kernels
  • "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
  • "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels

assigned_physical_gpu_ids = None class-attribute instance-attribute

Mapping from vLLM-local logical GPU IDs to physical GPU IDs.

For example, [2, 3] means logical GPU 0 maps to physical GPU 2, and logical GPU 1 maps to physical GPU 3. Physical IDs are used only at platform/topology boundaries such as NVML, NIC affinity, P2P checks, and final CUDA device selection when needed. When None, logical IDs map to visible device IDs in order.

cp_kv_cache_interleave_size = 1 class-attribute instance-attribute

Interleave size of kv_cache storage while using DCP or PCP. With total_cp_rank = pcp_rank * dcp_world_size + dcp_rank and total_cp_world_size = pcp_world_size * dcp_world_size, interleave_size tokens are stored on total_cp_rank i, then the next interleave_size tokens are stored on total_cp_rank i+1. Interleave_size=1: token-level alignment, where token i is stored on total_cp_rank i % total_cp_world_size. Interleave_size=block_size: block-level alignment, where tokens are first populated to the preceding ranks. Tokens are then stored in (rank i+1, block j) only after (rank i, block j) is fully occupied. Block_size should be greater than or equal to cp_kv_cache_interleave_size. Block_size should be divisible by cp_kv_cache_interleave_size.

cpu_distributed_timeout_seconds = None class-attribute instance-attribute

Timeout (in seconds) for cpu communication groups. If None, PyTorch's default timeout is used (1800s for gloo).

data_parallel_backend = 'mp' class-attribute instance-attribute

Backend to use for data parallel, either "mp" or "ray".

data_parallel_external_lb = False class-attribute instance-attribute

Whether to use "external" DP LB mode. Applies only to online serving and when data_parallel_size > 0. This is useful for a "one-pod-per-rank" wide-EP setup in Kubernetes. Supported only for MoE deployments; non-MoE models should use independent vLLM instances without --data-parallel-* arguments. Set implicitly when --data-parallel-rank is provided explicitly to vllm serve.

data_parallel_hybrid_lb = False class-attribute instance-attribute

Whether to use "hybrid" DP LB mode. Applies only to online serving and when data_parallel_size > 0. Enables running an AsyncLLM and API server on a "per-node" basis where vLLM load balances between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. Set explicitly in conjunction with --data-parallel-start-rank.

data_parallel_index = Field(init=False) class-attribute instance-attribute

Equal to the data parallel rank but not used for torch process groups and not overridden for dense models.

data_parallel_master_ip = '127.0.0.1' class-attribute instance-attribute

IP of the data parallel master.

data_parallel_master_port = 29500 class-attribute instance-attribute

Port of the data parallel master.

data_parallel_rank = Field(default=0, ge=0) class-attribute instance-attribute

Rank of the data parallel group. The runtime check at __post_init__ further bounds this by data_parallel_size.

data_parallel_rank_local = None class-attribute instance-attribute

Local rank of the data parallel group, set only in SPMD mode.

data_parallel_rpc_port = 29550 class-attribute instance-attribute

Port for data parallel messaging.

data_parallel_size = Field(default=1, ge=1) class-attribute instance-attribute

Number of data parallel groups. MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.

data_parallel_size_local = Field(default=1, ge=0) class-attribute instance-attribute

Number of local data parallel groups. A value of 0 is a sentinel used by the engine-args layer to signal that data parallelism was specified externally (see ParallelConfig.__post_init__).

dbo_decode_token_threshold = Field(default=32, ge=0) class-attribute instance-attribute

The threshold for dual batch overlap for batches only containing decodes. If the number of tokens in the request is greater than this threshold, microbatching will be used. Otherwise, the request will be processed in a single batch.

dbo_prefill_token_threshold = Field(default=512, ge=0) class-attribute instance-attribute

The threshold for dual batch overlap for batches that contain one or more prefills. If the number of tokens in the request is greater than this threshold, microbatching will be used. Otherwise, the request will be processed in a single batch.

dcp_comm_backend = 'ag_rs' class-attribute instance-attribute

Communication backend for Decode Context Parallel (DCP):

  • "ag_rs": AllGather + ReduceScatter (default, existing behavior)
  • "a2a": All-to-All exchange of partial outputs + LSE, then combine with Triton kernel. Reduces NCCL calls from 3 to 2 per layer for MLA models.

dcp_kv_cache_interleave_size = 1 class-attribute instance-attribute

Interleave size of kv_cache storage while using DCP. dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size, and will be deprecated when PCP is fully supported.

decode_context_parallel_size = Field(default=1, ge=1) class-attribute instance-attribute

Number of decode context parallel groups. DCP does not change the world size; it simply reuses the GPUs of the TP group, so tp_size needs to be divisible by dcp_size.

disable_custom_all_reduce = False class-attribute instance-attribute

Disable the custom all-reduce kernel and fall back to NCCL.

disable_nccl_for_dp_synchronization = None class-attribute instance-attribute

Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py to use Gloo instead of NCCL for its all reduce.

Defaults to True when async scheduling is enabled, False otherwise.

distributed_executor_backend = None class-attribute instance-attribute

Backend to use for distributed model workers, either "ray" or "mp" (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size is less than or equal to the number of GPUs available, "mp" will be used to keep processing on a single host. Otherwise, an error will be raised. To use "mp" you must also set nnodes, and to use "ray" you must manually set distributed_executor_backend to "ray".

Note

TPU platform only supports Ray for distributed inference.

distributed_timeout_seconds = None class-attribute instance-attribute

Timeout in seconds for distributed operations (e.g., init_process_group). If set, this value is passed to torch.distributed.init_process_group as the timeout parameter. If None, PyTorch's default timeout is used (600s for NCCL). Increase this for multi-node setups where model downloads may be slow.

enable_dbo = False class-attribute instance-attribute

Enable dual batch overlap for the model executor.

enable_elastic_ep = False class-attribute instance-attribute

Enable elastic expert parallelism with stateless NCCL groups for DP/EP.

enable_ep_weight_filter = False class-attribute instance-attribute

Skip non-local expert weights during model loading when expert parallelism is active. Each rank only reads its own expert shard from disk, which can drastically reduce storage I/O for MoE models with per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5). Has no effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE models.

enable_eplb = False class-attribute instance-attribute

Enable expert parallelism load balancing for MoE layers.

enable_expert_parallel = False class-attribute instance-attribute

Use expert parallelism instead of tensor parallelism for MoE layers.

eplb_config = Field(default_factory=EPLBConfig) class-attribute instance-attribute

Expert parallelism configuration.

expert_placement_strategy = 'linear' class-attribute instance-attribute

The expert placement strategy for MoE layers:

  • "linear": Experts are placed in a contiguous manner. For example, with 4 experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have experts [2, 3].
  • "round_robin": Experts are placed in a round-robin manner. For example, with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 will have experts [1, 3]. This strategy can help improve load balancing for grouped expert models with no redundant experts.

is_moe_model = None class-attribute instance-attribute

Whether the deployed model is MoE (if known).

local_engines_only property

Client manages local+remote EngineCores in pure internal LB case. Client manages local EngineCores in hybrid and external LB case.

master_addr = '127.0.0.1' class-attribute instance-attribute

distributed master address for multi-node distributed inference when distributed_executor_backend is mp.

master_port = 29501 class-attribute instance-attribute

distributed master port for multi-node distributed inference when distributed_executor_backend is mp.

max_parallel_loading_workers = Field(default=None, ge=1) class-attribute instance-attribute

Maximum number of parallel loading workers when loading model sequentially in multiple batches. To avoid RAM OOM when using tensor parallel and large models.

nnodes = Field(default=1, ge=1) class-attribute instance-attribute

Number of nodes for multi-node distributed inference when distributed_executor_backend is mp.

node_rank = Field(default=0, ge=0) class-attribute instance-attribute

distributed node rank for multi-node distributed inference when distributed_executor_backend is mp.

numa_bind = False class-attribute instance-attribute

Enable NUMA binding for GPU worker subprocesses.

By default, workers are pinned to their GPU's NUMA-local CPUs and memory; on PCT-capable Xeons they also auto-bind to the SKU's PCT priority cores.

numa_bind_cpus = None class-attribute instance-attribute

Optional CPU lists to bind each GPU worker to.

Specify one CPU list per visible GPU, for example ["0-3", "4-7", "8-11", "12-15"]. When set, vLLM uses numactl --physcpubind instead of --cpunodebind. This is useful for custom policies such as binding to PCT or other high-frequency cores. Each entry must use numactl --physcpubind CPU-list syntax, for example "0-3" or "0,2,4-7".

numa_bind_nodes = None class-attribute instance-attribute

NUMA node to bind each GPU worker to.

Specify one NUMA node per visible GPU, for example [0, 0, 1, 1] for a 4-GPU system with GPUs 0-1 on NUMA node 0 and GPUs 2-3 on NUMA node 1. If unset and numa_bind=True, vLLM auto-detects the GPU-to-NUMA topology. The values are passed to numactl --membind and --cpunodebind, so they must be valid numactl NUMA node indices.

pipeline_parallel_size = Field(default=1, ge=1) class-attribute instance-attribute

Number of pipeline parallel groups.

placement_group = None class-attribute instance-attribute

ray distributed model workers placement group.

prefill_context_parallel_size = Field(default=1, ge=1) class-attribute instance-attribute

Number of prefill context parallel groups.

rank = 0 class-attribute instance-attribute

Global rank in distributed setup.

ray_runtime_env = None class-attribute instance-attribute

Ray runtime environment to pass to distributed workers.

ray_workers_use_nsight = False class-attribute instance-attribute

Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.

sd_worker_cls = 'auto' class-attribute instance-attribute

The full name of the worker class to use for speculative decoding. If "auto", the worker class will be determined based on the platform.

tensor_parallel_size = Field(default=1, ge=1) class-attribute instance-attribute

Number of tensor parallel groups.

ubatch_size = Field(default=0, ge=0) class-attribute instance-attribute

Number of ubatch size.

worker_cls = 'auto' class-attribute instance-attribute

The full name of the worker class to use. If "auto", the worker class will be determined based on the platform.

worker_extension_cls = '' class-attribute instance-attribute

The full name of the worker extension class to use. The worker extension class is dynamically inherited by the worker class. This is used to inject new attributes and methods to the worker class for use in collective_rpc calls.

world_size = Field(init=False) class-attribute instance-attribute

world_size is TPxPP, it affects the number of workers we create.

world_size_across_dp property

world_size_across_dp is TPxPPxDP, it is the size of the world including data parallelism.

_pick_stateless_dp_port()

Return (port, listen_socket) for DP group init.

With a coord store, rank 0 binds a socket and publishes the port; others read it. Without one, pops a pre-allocated port and returns listen_socket=None.

Source code in vllm/config/parallel.py
def _pick_stateless_dp_port(self) -> tuple[int, socket.socket | None]:
    """Return ``(port, listen_socket)`` for DP group init.

    With a coord store, rank 0 binds a socket and publishes the port;
    others read it.  Without one, pops a pre-allocated port and
    returns ``listen_socket=None``.
    """
    if not self._coord_store_port:
        return self.get_next_dp_init_port(), None

    from vllm.distributed.utils import get_cached_tcp_store_client

    store = get_cached_tcp_store_client(
        self.data_parallel_master_ip, self._coord_store_port
    )

    key = "dp_master_port"
    if self.data_parallel_rank == 0:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind((self.data_parallel_master_ip, 0))
        s.listen()
        port = s.getsockname()[1]
        store.set(key, str(port).encode())
        return port, s
    else:
        return int(store.get(key).decode()), None

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialisation is delayed.

Source code in vllm/config/parallel.py
@field_validator("disable_nccl_for_dp_synchronization", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Pass `None` through untouched; delegate any other value to `handler`.

    `None` marks a value whose initialisation is delayed, so it bypasses
    validation.
    """
    if value is None:
        return None
    return handler(value)

compute_hash()

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

This hash is also used for DP worker configuration validation to prevent hangs from mismatched collective communication patterns.

Source code in vllm/config/parallel.py
def compute_hash(self):
    """
    Hash every config field that shapes the computation graph between
    the input ids/embeddings and the final hidden states. Anything that
    happens before the input ids/embeddings or after the final hidden
    states is left out.

    The same hash is used to validate DP worker configuration, so that
    mismatched collective communication patterns are caught instead of
    causing hangs.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # Fields excluded from the hash: derived/runtime topology, networking,
    # or launch details.
    excluded = {
        # Data-parallel rank, placement, and endpoints
        "data_parallel_index",
        "data_parallel_rank",
        "data_parallel_rank_local",
        "data_parallel_size_local",
        "data_parallel_backend",
        "data_parallel_external_lb",
        "data_parallel_hybrid_lb",
        "data_parallel_master_ip",
        "data_parallel_master_port",
        "data_parallel_rpc_port",
        "_data_parallel_master_port_list",
        # Global rank and multi-node rendezvous
        "rank",
        "node_rank",
        "nnodes",
        "master_addr",
        "master_port",
        # Executor, worker classes, and launch options
        "distributed_executor_backend",
        "placement_group",
        "ray_runtime_env",
        "ray_workers_use_nsight",
        "worker_cls",
        "sd_worker_cls",
        "worker_extension_cls",
        "max_parallel_loading_workers",
        "disable_custom_all_reduce",
        # API server scale-out bookkeeping
        "_api_process_count",
        "_api_process_rank",
        # NUMA binding is per-rank host-side memory locality and does not
        # change collective-communication semantics. With numa_bind and
        # auto-detection, each DP rank records its own NUMA node in
        # numa_bind_nodes (see vllm/utils/numa_utils.py `_get_numa_node`),
        # so hashing these would make the DP hash diverge across ranks.
        "numa_bind",
        "numa_bind_cpus",
        "numa_bind_nodes",
        "assigned_physical_gpu_ids",
    }

    return hash_factors(get_hash_factors(self, excluded))

get_next_dp_init_port()

We might need to initialize process groups related to data parallelism in multiple processes, e.g. both in the worker and in the engine, which can live in different processes. To avoid port conflicts, we pop a new port from the prepared port list each time we need to initialize a new process group related to data parallelism.

Source code in vllm/config/parallel.py
def get_next_dp_init_port(self) -> int:
    """
    Hand out a fresh port for initializing a data-parallel process group.

    Such groups may be created in several processes (e.g. the worker and
    the engine), so each call yields a different port to avoid conflicts:
    a port is popped from the prepared list while it has entries;
    afterwards the current `data_parallel_master_port` is returned and
    the attribute is advanced by one.
    """
    prepared_ports = self._data_parallel_master_port_list
    if prepared_ports:
        return prepared_ports.pop()

    # Prepared list is empty: hand out the master port and bump it.
    port = self.data_parallel_master_port
    self.data_parallel_master_port = port + 1
    return port

sync_dp_state(dp_group, has_unfinished, pending_pause) staticmethod

Combined all-reduce for DP state synchronization.

Uses a single SUM all-reduce on a 2-element tensor

  • [0] = 1 if this rank has unfinished work, else 0. SUM > 0 ≡ logical OR across ranks → any rank has work.
  • [1] = 1 if this rank has a pending pause request, else 0. SUM == dp_size ≡ all ranks reached pause consensus.

has_unfinished_global is true if any rank has unfinished work, or if some ranks are waiting for a pause consensus.

Returns: (has_unfinished_global, pause_consensus)

Source code in vllm/config/parallel.py
@staticmethod
def sync_dp_state(
    dp_group: ProcessGroup, has_unfinished: bool, pending_pause: bool
) -> tuple[bool, bool]:
    """Synchronize DP work/pause state with one SUM all-reduce.

    Each rank contributes a 2-element int32 CPU tensor:
      [0] is 1 when the rank still has unfinished work, else 0;
          a global sum above zero means at least one rank has work.
      [1] is 1 when the rank has a pending pause request, else 0;
          a global sum equal to the group size means every rank
          agrees to pause.

    Work is reported as globally unfinished when any rank has work, or
    when only some of the ranks are waiting for pause consensus.

    Returns:
        (has_unfinished_global, pause_consensus)
    """
    flags = torch.tensor(
        [int(has_unfinished), int(pending_pause)], dtype=torch.int32, device="cpu"
    )
    torch.distributed.all_reduce(flags, op=ReduceOp.SUM, group=dp_group)

    unfinished_votes, pause_votes = flags.tolist()
    group_size = dp_group.size()

    pause_consensus = pause_votes == group_size
    # A non-zero remainder means only part of the group asked to pause.
    partial_pause = pause_votes % group_size != 0
    return unfinished_votes > 0 or partial_pause, pause_consensus

PassConfig

Configuration for custom Inductor passes.

This is separate from general CompilationConfig so that inductor passes don't all have access to full configuration - that would create a cycle as the PassManager is set as a property of config.

You must pass PassConfig to VLLMConfig constructor via the CompilationConfig constructor. VLLMConfig's post_init does further initialization. If used outside of the VLLMConfig, some fields may be left in an improper state.

Methods:

Attributes:

Source code in vllm/config/compilation.py
@config
class PassConfig:
    """Configuration for custom Inductor passes.

    This is separate from general `CompilationConfig` so that inductor passes
    don't all have access to full configuration - that would create a cycle as
    the `PassManager` is set as a property of config.

    You must pass PassConfig to VLLMConfig constructor via the CompilationConfig
    constructor. VLLMConfig's post_init does further initialization.
    If used outside of the VLLMConfig, some fields may be left in an
    improper state.

    Several flags are annotated `bool` but default to `None`; the `None`
    placeholder is let through validation by `_skip_none_validation`.
    """

    # New flags
    fuse_norm_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom RMSNorm + quant ops."""
    fuse_act_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom SiluMul + quant ops."""
    fuse_attn_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom Attention and MLAAttention + quant ops."""
    eliminate_noops: bool = Field(default=True)
    """Eliminate no-op ops."""
    enable_sp: bool = None  # type: ignore[assignment]
    """Enable sequence parallelism. Requires TP>1. Automatically disabled
    if the model's hidden_size is too small for SP to be beneficial
    (threshold is device-capability dependent)."""
    fuse_gemm_comms: bool = None  # type: ignore[assignment]
    """Enable async TP."""
    fuse_allreduce_rms: bool = None  # type: ignore[assignment]
    """Enable flashinfer allreduce fusion."""
    enable_qk_norm_rope_fusion: bool = None  # type: ignore[assignment]
    """Enable fused Q/K RMSNorm + RoPE pass."""
    fuse_rope_kvcache_cat_mla: bool = None  # type: ignore[assignment]
    """Enable fused MLA KV cache update with RoPE."""

    # ROCm/AITER specific fusions
    fuse_act_padding: bool = None  # type: ignore[assignment]
    """Fuse the custom RMSNorm + padding ops."""
    fuse_mla_dual_rms_norm: bool = None  # type: ignore[assignment]
    """Fuse paired q/kv RMS norms in MLA attention."""
    fuse_rope_kvcache: bool = None  # type: ignore[assignment]
    """Fuse the QK rope + KV cache ops."""

    rope_kvcache_fusion_max_token_num: int = 256
    """The threshold for ROCm AITER RoPE+KVCache fusion e.g. for small batch decode.
    Larger batch sizes e.g. during prefill will use the unfused kernels.
    """

    fi_allreduce_fusion_max_size_mb: float | None = None
    """The threshold of the communicated tensor sizes under which
    vllm should use flashinfer fused allreduce. Specified as a
    float in MB.
    Unspecified will fallback to default values
    which are compute capability and world size dependent.
        FI_ALLREDUCE_FUSION_MAX_SIZE_MB = {
            90: {
                2: 64,  # 64MB
                4: 2,  # 2MB
                8: 1,  # 1MB
            },
            100: {
                2: 64,  # 64MB
                4: 32,  # 32MB
                8: 1,  # 1MB
            },
        }, where key is the device capability"""
    sp_min_token_num: int | None = None
    """The minimum number of tokens above which vllm should use
    sequence parallelism. Specified as an integer token count.
    Unspecified will fallback to default values which are compute
    capability and world size dependent."""

    # TODO(luka) better pass enabling system.

    def flashinfer_max_size(self, world_size: int) -> int | None:
        """
        Returns the max communication size in bytes for flashinfer
        allreduce fusion for the given world size. Returns None if world size
        is not supported by configs as it's not supported by flashinfer.

        Args:
            world_size: Number of ranks taking part in the allreduce.

        Returns:
            The limit in bytes, or None when `world_size` is not one of the
            supported sizes or no limit is configured for it.
        """

        MiB = 1024 * 1024
        FI_SUPPORTED_WORLD_SIZES = [2, 4, 8, 16]
        if world_size not in FI_SUPPORTED_WORLD_SIZES:
            return None
        # An explicit user setting wins; otherwise use the per-platform table.
        max_size_mb = self.fi_allreduce_fusion_max_size_mb
        if max_size_mb is None:
            max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size)

        return int(max_size_mb * MiB) if max_size_mb is not None else None

    @staticmethod
    def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]:
        """Return the default size limits (in MB) keyed by world size.

        The table is looked up by the current device capability. An empty
        dict is returned when the platform is not CUDA, when the capability
        is unknown, or when the capability has no entry in the table.
        """
        # Imported lazily inside the function rather than at module level.
        from vllm.compilation.passes.fusion.allreduce_rms_fusion import (
            FI_ALLREDUCE_FUSION_MAX_SIZE_MB,
        )
        from vllm.platforms import current_platform

        if not current_platform.is_cuda():
            return {}
        capability = current_platform.get_device_capability()
        if capability is None:
            return {}
        return FI_ALLREDUCE_FUSION_MAX_SIZE_MB.get(capability.to_int(), {})

    def compute_hash(self) -> str:
        """
        Produces a hash unique to the pass configuration.
        Any new fields that affect compilation should be added to the hash.
        Any future fields that don't affect compilation should be excluded.
        """

        return hash_factors(get_hash_factors(self, set()))

    # Every flag declared above as `bool = None` must be listed here so that an
    # explicitly supplied `None` is accepted for all of them alike.
    @field_validator(
        "fuse_norm_quant",
        "fuse_act_quant",
        "fuse_attn_quant",
        "enable_sp",
        "fuse_gemm_comms",
        "fuse_allreduce_rms",
        "enable_qk_norm_rope_fusion",
        "fuse_act_padding",
        "fuse_mla_dual_rms_norm",
        "fuse_rope_kvcache",
        "fuse_rope_kvcache_cat_mla",
        mode="wrap",
    )
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialisation is delayed.

        Args:
            value: Raw value supplied for the field.
            handler: The wrapped validator for the field.

        Returns:
            `None` unchanged, otherwise whatever `handler(value)` returns.
        """
        if value is None:
            return value
        return handler(value)

    def __post_init__(self) -> None:
        # Warn about fusions that depend on no-op elimination, then switch off
        # fusions that the current platform does not support.

        if not self.eliminate_noops:
            if self.fuse_norm_quant or self.fuse_act_quant:
                logger.warning_once(
                    "Fusion enabled but reshape elimination disabled. "
                    "RMSNorm/SiluMul + quant (fp8) fusion might not work"
                )
            if self.fuse_attn_quant:
                logger.warning_once(
                    "Fusion enabled but reshape elimination disabled. "
                    "Attention + quant (fp8) fusion might not work"
                )
            if self.fuse_allreduce_rms:
                logger.warning_once(
                    "Fusion enabled but reshape elimination disabled. "
                    "Allreduce + rms norm + quant (fp8) fusion might not work"
                )
            if self.fuse_act_padding:
                logger.warning_once(
                    "Fusion enabled but reshape elimination disabled. "
                    "RMSNorm + padding fusion might not work"
                )
        # Flags still at the `None` placeholder are falsy and skip these checks.
        if self.enable_qk_norm_rope_fusion and not (
            current_platform.is_cuda_alike() or current_platform.is_xpu()
        ):
            logger.warning_once(
                "QK Norm + RoPE fusion enabled but the current platform is not "
                "CUDA, ROCm or XPU. The fusion will be disabled."
            )
            self.enable_qk_norm_rope_fusion = False
        if self.fuse_act_padding and not current_platform.is_rocm():
            logger.warning_once(
                "Padding fusion enabled but the current platform is not ROCm. "
                "The fusion will be disabled."
            )
            self.fuse_act_padding = False
        if self.fuse_mla_dual_rms_norm and not current_platform.is_rocm():
            logger.warning_once(
                "MLA dual RMS norm fusion requires ROCm/AITER. "
                "The fusion will be disabled."
            )
            self.fuse_mla_dual_rms_norm = False
        if self.fuse_rope_kvcache and not current_platform.is_rocm():
            logger.warning_once(
                "KV cache fusion currently only enabled on ROCm. "
                "The fusion will be disabled."
            )
            self.fuse_rope_kvcache = False
        if self.fuse_rope_kvcache_cat_mla and not current_platform.is_cuda_alike():
            logger.warning_once(
                "MLA KV cache update with RoPE fusion enabled but the "
                "current platform is not CUDA or ROCm. The fusion will be disabled."
            )
            self.fuse_rope_kvcache_cat_mla = False

    def log_enabled_passes(self) -> None:
        """
        Log the enabled custom fusion passes.
        This is called at the end of VLLMConfig post_init,
        after all defaults are finalized.
        TODO also log the compile ranges for which this is enabled.

        Only fields whose name starts with `fuse_` are reported, with that
        prefix stripped. Nothing is logged when none of them is enabled.
        """
        enabled_fusions = [
            f.name.removeprefix("fuse_")
            for f in fields(self)  # type: ignore[arg-type]
            if f.name.startswith("fuse_") and getattr(self, f.name)
        ]

        if enabled_fusions:
            logger.info_once(
                "Enabled custom fusions: %s", ", ".join(enabled_fusions), scope="global"
            )

eliminate_noops = Field(default=True) class-attribute instance-attribute

Eliminate no-op ops.

enable_qk_norm_rope_fusion = None class-attribute instance-attribute

Enable fused Q/K RMSNorm + RoPE pass.

enable_sp = None class-attribute instance-attribute

Enable sequence parallelism. Requires TP>1. Automatically disabled if the model's hidden_size is too small for SP to be beneficial (threshold is device-capability dependent).

fi_allreduce_fusion_max_size_mb = None class-attribute instance-attribute

The threshold of the communicated tensor sizes under which vllm should use flashinfer fused allreduce. Specified as a float in MB. If unspecified, it falls back to default values that depend on compute capability and world size: `FI_ALLREDUCE_FUSION_MAX_SIZE_MB = {90: {2: 64, 4: 2, 8: 1}, 100: {2: 64, 4: 32, 8: 1}}`, where the outer key is the device capability, the inner key is the world size, and each value is the limit in MB.

fuse_act_padding = None class-attribute instance-attribute

Fuse the custom RMSNorm + padding ops.

fuse_act_quant = None class-attribute instance-attribute

Fuse the custom SiluMul + quant ops.

fuse_allreduce_rms = None class-attribute instance-attribute

Enable flashinfer allreduce fusion.

fuse_attn_quant = None class-attribute instance-attribute

Fuse the custom Attention and MLAAttention + quant ops.

fuse_gemm_comms = None class-attribute instance-attribute

Enable async TP.

fuse_mla_dual_rms_norm = None class-attribute instance-attribute

Fuse paired q/kv RMS norms in MLA attention.

fuse_norm_quant = None class-attribute instance-attribute

Fuse the custom RMSNorm + quant ops.

fuse_rope_kvcache = None class-attribute instance-attribute

Fuse the QK rope + KV cache ops.

fuse_rope_kvcache_cat_mla = None class-attribute instance-attribute

Enable fused MLA KV cache update with RoPE.

rope_kvcache_fusion_max_token_num = 256 class-attribute instance-attribute

The threshold for ROCm AITER RoPE+KVCache fusion e.g. for small batch decode. Larger batch sizes e.g. during prefill will use the unfused kernels.

sp_min_token_num = None class-attribute instance-attribute

The minimum number of tokens above which vllm should use sequence parallelism. Specified as an integer token count. Unspecified will fallback to default values which are compute capability and world size dependent.

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialisation is delayed.

Source code in vllm/config/compilation.py
# `enable_qk_norm_rope_fusion` is declared `bool = None` like the other flags
# listed here, so it is included to accept an explicit `None` as well.
@field_validator(
    "fuse_norm_quant",
    "fuse_act_quant",
    "fuse_attn_quant",
    "enable_sp",
    "fuse_gemm_comms",
    "fuse_allreduce_rms",
    "enable_qk_norm_rope_fusion",
    "fuse_act_padding",
    "fuse_mla_dual_rms_norm",
    "fuse_rope_kvcache",
    "fuse_rope_kvcache_cat_mla",
    mode="wrap",
)
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Skip validation if the value is `None` when initialisation is delayed.

    Args:
        value: Raw value supplied for the field.
        handler: The wrapped validator for the field.

    Returns:
        `None` unchanged, otherwise whatever `handler(value)` returns.
    """
    if value is None:
        return value
    return handler(value)

compute_hash()

Produces a hash unique to the pass configuration. Any new fields that affect compilation should be added to the hash. Any future fields that don't affect compilation should be excluded.

Source code in vllm/config/compilation.py
def compute_hash(self) -> str:
    """
    Return a hash that identifies this pass configuration.

    Fields added later belong in the hash when they affect compilation and
    should be left out of it when they do not.
    """
    # The second argument is an empty set; presumably the factors to
    # exclude (confirm against `get_hash_factors`).
    factors = get_hash_factors(self, set())
    return hash_factors(factors)

flashinfer_max_size(world_size)

Returns the max communication size in bytes for flashinfer allreduce fusion for the given world size. Returns None if world size is not supported by configs as it's not supported by flashinfer.

Source code in vllm/config/compilation.py
def flashinfer_max_size(self, world_size: int) -> int | None:
    """
    Returns the max communication size in bytes for flashinfer
    allreduce fusion for the given world size. Returns None if world size
    is not supported by configs as it's not supported by flashinfer.
    """

    MiB = 1024 * 1024
    FI_SUPPORTED_WORLD_SIZES = [2, 4, 8, 16]
    if world_size not in FI_SUPPORTED_WORLD_SIZES:
        return None
    max_size_mb = self.fi_allreduce_fusion_max_size_mb
    if max_size_mb is None:
        max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size)

    return int(max_size_mb * MiB) if max_size_mb is not None else None

log_enabled_passes()

Log the enabled custom fusion passes. This is called at the end of VLLMConfig post_init, after all defaults are finalized. TODO also log the compile ranges for which this is enabled.

Source code in vllm/config/compilation.py
def log_enabled_passes(self) -> None:
    """
    Report which custom fusion passes are switched on.

    Called at the end of VLLMConfig post_init, once all defaults are
    finalized. Only truthy fields whose name starts with `fuse_` are
    listed, with that prefix removed; nothing is logged when the list is
    empty.
    TODO also log the compile ranges for which this is enabled.
    """
    prefix = "fuse_"
    active = []
    for spec in fields(self):  # type: ignore[arg-type]
        if getattr(self, spec.name) and spec.name.startswith(prefix):
            active.append(spec.name[len(prefix) :])

    if not active:
        return
    logger.info_once("Enabled custom fusions: %s", ", ".join(active), scope="global")

PoolerConfig

Controls the behavior of output pooling in pooling models.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config,

Attributes:

  • dimensions (int | None) –

    Reduce the dimensions of embeddings if model

  • enable_chunked_processing (bool) –

    Whether to enable chunked processing for long inputs that exceed the model's

  • logit_mean (float | None) –

    If provided, subtract this value from classification logits before

  • logit_sigma (float | None) –

    If provided, divide the classification logits by this value after

  • max_embed_len (int | None) –

    Maximum input length allowed for embedding generation. When set, allows

  • pooling_type (SequencePoolingType | TokenPoolingType | None) –

    The pooling method used for pooling.

  • returned_token_ids (list[int] | None) –

    A list of indices for the vocabulary dimensions to be extracted,

  • seq_pooling_type (SequencePoolingType | None) –

    The pooling method used for sequence pooling.

  • step_tag_id (int | None) –

    If set, only the score corresponding to the step_tag_id in the

  • task (PoolingTask | None) –

    The task used for pooling.

  • tok_pooling_type (TokenPoolingType | None) –

    The pooling method used for tokenwise pooling.

  • use_activation (bool | None) –

    Whether to apply activation function to the pooler outputs.

Source code in vllm/config/pooler.py
@config
class PoolerConfig:
    """Settings that control output pooling for pooling models."""

    task: PoolingTask | None = None
    """
    The task used for pooling.
    """

    pooling_type: SequencePoolingType | TokenPoolingType | None = None
    """
    The pooling method used for pooling.

    If set, `seq_pooling_type` or `tok_pooling_type` are automatically populated
    with this field. Alternatively, users can set `seq_pooling_type` and
    `tok_pooling_type` explicitly.

    This field is mainly for user convenience. Internal code should always use
    `seq_pooling_type` or `tok_pooling_type` instead of `pooling_type`.
    """

    seq_pooling_type: SequencePoolingType | None = None
    """
    The pooling method used for sequence pooling.
    """

    tok_pooling_type: TokenPoolingType | None = None
    """
    The pooling method used for tokenwise pooling.
    """

    use_activation: bool | None = None
    """
    Whether to apply activation function to the pooler outputs.
    `None` uses the pooler's default, which is `True` in most cases.
    """

    # Embedding models
    dimensions: int | None = None
    """
    Reduce the dimensions of embeddings if model
    support matryoshka representation. Defaults to None.
    """
    enable_chunked_processing: bool = False
    """
    Whether to enable chunked processing for long inputs that exceed the model's
    maximum position embeddings. When enabled, long inputs will be split into
    chunks, processed separately, and then aggregated using weighted averaging.
    This allows embedding models to handle arbitrarily long text without CUDA
    errors. Defaults to False.
    """
    max_embed_len: int | None = None
    """
    Maximum input length allowed for embedding generation. When set, allows
    inputs longer than max_embed_len to be accepted for embedding models.
    When an input exceeds max_embed_len, it will be handled according to 
    the original max_model_len validation logic. 
    Defaults to None (i.e. set to max_model_len).
    """

    # Classification models: affine score calibration
    logit_mean: float | None = None
    """
    If provided, subtract this value from classification logits before
    activation. Used for affine score calibration (Platt scaling):
    activation((logit - logit_mean) / logit_sigma). Defaults to None.
    """

    logit_sigma: float | None = None
    """
    If provided, divide the classification logits by this value after
    mean subtraction. Used for affine score calibration (Platt scaling):
    activation((logit - logit_mean) / logit_sigma). Defaults to None.
    """

    # Reward models
    step_tag_id: int | None = None
    """
    If set, only the score corresponding to the `step_tag_id` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """
    returned_token_ids: list[int] | None = None
    """
    A list of indices for the vocabulary dimensions to be extracted,
    such as the token IDs of `good_token` and `bad_token` in the
    `math-shepherd-mistral-7b-prm` model.
    """

    def __post_init__(self) -> None:
        """Reject a zero `logit_sigma` and expand `pooling_type`.

        When `pooling_type` is truthy it is copied into `seq_pooling_type` or
        `tok_pooling_type`, depending on which set of known types contains it.

        Raises:
            ValueError: If `logit_sigma` is 0, or if `pooling_type` is given
                together with `seq_pooling_type` or `tok_pooling_type`.
            NotImplementedError: If `pooling_type` is in neither known set.
        """
        sigma = self.logit_sigma
        if sigma is not None and sigma == 0:
            raise ValueError("logit_sigma cannot be 0 (division by zero)")

        requested = self.pooling_type
        if not requested:
            return

        # The shorthand must not be combined with either explicit field.
        if self.seq_pooling_type is not None:
            raise ValueError("Cannot set both `pooling_type` and `seq_pooling_type`")
        if self.tok_pooling_type is not None:
            raise ValueError("Cannot set both `pooling_type` and `tok_pooling_type`")

        if requested in SEQ_POOLING_TYPES:
            logger.debug(
                "Resolved `pooling_type=%r` to `seq_pooling_type=%r`.",
                requested,
                requested,
            )
            self.seq_pooling_type = requested  # type: ignore[assignment]
            return

        if requested in TOK_POOLING_TYPES:
            logger.debug(
                "Resolved `pooling_type=%r` to `tok_pooling_type=%r`.",
                requested,
                requested,
            )
            self.tok_pooling_type = requested  # type: ignore[assignment]
            return

        raise NotImplementedError(requested)

    def get_seq_pooling_type(self) -> SequencePoolingType:
        """Return `seq_pooling_type`, raising ValueError if it is unset."""
        resolved = self.seq_pooling_type
        if resolved is not None:
            return resolved
        raise ValueError(
            "seq_pooling_type is not set; it should be resolved by"
            " ModelConfig before calling get_seq_pooling_type()"
        )

    def get_tok_pooling_type(self) -> TokenPoolingType:
        """Return `tok_pooling_type`, raising ValueError if it is unset."""
        resolved = self.tok_pooling_type
        if resolved is not None:
            return resolved
        raise ValueError(
            "tok_pooling_type is not set; it should be resolved by"
            " ModelConfig before calling get_tok_pooling_type()"
        )

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Return a hash identifying every setting that affects the structure
        of the computation graph from input ids/embeddings to the final
        hidden states, excluding anything before input ids/embeddings and
        after the final hidden states.
        """
        # Nothing in this config affects the computation graph, so the
        # factor list is empty.
        factors: list[Any] = []
        digest = safe_hash(str(factors).encode(), usedforsecurity=False)
        return digest.hexdigest()

dimensions = None class-attribute instance-attribute

Reduce the dimensions of embeddings if model support matryoshka representation. Defaults to None.

enable_chunked_processing = False class-attribute instance-attribute

Whether to enable chunked processing for long inputs that exceed the model's maximum position embeddings. When enabled, long inputs will be split into chunks, processed separately, and then aggregated using weighted averaging. This allows embedding models to handle arbitrarily long text without CUDA errors. Defaults to False.

logit_mean = None class-attribute instance-attribute

If provided, subtract this value from classification logits before activation. Used for affine score calibration (Platt scaling): activation((logit - logit_mean) / logit_sigma). Defaults to None.

logit_sigma = None class-attribute instance-attribute

If provided, divide the classification logits by this value after mean subtraction. Used for affine score calibration (Platt scaling): activation((logit - logit_mean) / logit_sigma). Defaults to None.

max_embed_len = None class-attribute instance-attribute

Maximum input length allowed for embedding generation. When set, allows inputs longer than max_model_len to be accepted for embedding models. When an input exceeds max_embed_len, it will be handled according to the original max_model_len validation logic. Defaults to None (i.e. set to max_model_len).

pooling_type = None class-attribute instance-attribute

The pooling method used for pooling.

If set, seq_pooling_type or tok_pooling_type are automatically populated with this field. Alternatively, users can set seq_pooling_type and tok_pooling_type explicitly.

This field is mainly for user convenience. Internal code should always use seq_pooling_type or tok_pooling_type instead of pooling_type.

returned_token_ids = None class-attribute instance-attribute

A list of indices for the vocabulary dimensions to be extracted, such as the token IDs of good_token and bad_token in the math-shepherd-mistral-7b-prm model.

seq_pooling_type = None class-attribute instance-attribute

The pooling method used for sequence pooling.

step_tag_id = None class-attribute instance-attribute

If set, only the score corresponding to the step_tag_id in the generated sentence should be returned. Otherwise, the scores for all tokens are returned.

task = None class-attribute instance-attribute

The task used for pooling.

tok_pooling_type = None class-attribute instance-attribute

The pooling method used for tokenwise pooling.

use_activation = None class-attribute instance-attribute

Whether to apply activation function to the pooler outputs. None uses the pooler's default, which is True in most cases.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/pooler.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Return a hash identifying every setting that affects the structure
    of the computation graph from input ids/embeddings to the final
    hidden states, excluding anything before input ids/embeddings and
    after the final hidden states.
    """
    # Nothing in this config affects the computation graph, so the
    # factor list is empty.
    factors: list[Any] = []
    digest = safe_hash(str(factors).encode(), usedforsecurity=False)
    return digest.hexdigest()

PrefetchOffloadConfig

Configuration for prefetch-based CPU offloading.

Groups layers and uses async H2D prefetch to hide transfer latency.

Attributes:

Source code in vllm/config/offload.py
@config
class PrefetchOffloadConfig:
    """Configuration for prefetch-based CPU offloading.

    Groups layers and uses async H2D prefetch to hide transfer latency.

    The integer fields only declare a lower bound through `Field(ge=...)`.
    The `offload_num_in_group <= offload_group_size` relationship described
    below is not checked anywhere in this class.
    """

    # Lower bound 0; the default of 0 leaves the feature disabled.
    offload_group_size: int = Field(default=0, ge=0)
    """Group every N layers together. Offload last `offload_num_in_group`
    layers of each group. Default is 0 (disabled).
    Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,...
    Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer
    latency.
    """

    # Lower bound 1. NOTE(review): the upper bound against
    # `offload_group_size` is presumably enforced elsewhere; confirm.
    offload_num_in_group: int = Field(default=1, ge=1)
    """Number of layers to offload per group.
    Must be <= offload_group_size. Default is 1."""

    # Lower bound 0, so a value of 0 is accepted.
    offload_prefetch_step: int = Field(default=1, ge=0)
    """Number of layers to prefetch ahead.
    Higher values hide more latency but use more GPU memory. Default is 1."""

    # `default_factory=set` gives each instance its own empty set.
    offload_params: set[str] = Field(default_factory=set)
    """The set of parameter name segments to target for prefetch offloading.
    Unmatched parameters are not offloaded. If this set is empty, ALL
    parameters of each offloaded layer are offloaded.
    Uses segment matching: "w13_weight" matches "mlp.experts.w13_weight"
    but not "mlp.experts.w13_weight_scale".
    """

offload_group_size = Field(default=0, ge=0) class-attribute instance-attribute

Group every N layers together. Offload last offload_num_in_group layers of each group. Default is 0 (disabled). Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,... Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer latency.

offload_num_in_group = Field(default=1, ge=1) class-attribute instance-attribute

Number of layers to offload per group. Must be <= offload_group_size. Default is 1.

offload_params = Field(default_factory=set) class-attribute instance-attribute

The set of parameter name segments to target for prefetch offloading. Unmatched parameters are not offloaded. If this set is empty, ALL parameters of each offloaded layer are offloaded. Uses segment matching: "w13_weight" matches "mlp.experts.w13_weight" but not "mlp.experts.w13_weight_scale".

offload_prefetch_step = Field(default=1, ge=0) class-attribute instance-attribute

Number of layers to prefetch ahead. Higher values hide more latency but use more GPU memory. Default is 1.

ProfilerConfig

Dataclass which contains profiler config for the engine.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config,

Attributes:

Source code in vllm/config/profiler.py
@config
class ProfilerConfig:
    """Profiler settings for the engine."""

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:

    - 'torch': Use PyTorch profiler.
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
    worker's traces (CPU & GPU) will be saved under this directory. Note that
    it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by default
    as it is useful for debugging. Can be disabled via 
    --profiler-config.torch_profiler_with_stack=false CLI flag."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by default"""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler.
    Disabled by default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the
    'torch' profiler. This is needed to reduce overhead when using delay/limit options,
    since the front-end profiling does not track iterations and will capture the
    entire range.
    """

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
    """

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after starting profiling.
    Defaults to 0, meaning no limit.
    """

    warmup_iterations: int = Field(default=0, ge=0)
    """Number of warmup iterations for PyTorch profiler schedule.
    During warmup, the profiler runs but data is discarded. This helps reduce
    noise from JIT compilation and other one-time costs in the profiled trace.
    Defaults to 0 (schedule-based profiling disabled, recording all iterations).
    Set to a positive value (e.g., 2) to enable schedule-based profiling.
    """

    active_iterations: int = Field(default=5, ge=1)
    """Number of active iterations for PyTorch profiler schedule.
    This is the number of iterations where profiling data is actually collected.
    Defaults to 5 active iterations.
    """

    wait_iterations: int = Field(default=0, ge=0)
    """Number of wait iterations for PyTorch profiler schedule.
    During wait, the profiler is completely off with zero overhead.
    This allows skipping initial iterations before warmup begins.
    Defaults to 0 (no wait period).
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Return a hash identifying every setting that affects the structure
        of the computation graph from input ids/embeddings to the final
        hidden states, excluding anything before input ids/embeddings and
        after the final hidden states.
        """
        # Nothing in this config affects the computation graph, so the
        # factor list is empty.
        factors: list[Any] = []
        digest = safe_hash(str(factors).encode(), usedforsecurity=False)
        return digest.hexdigest()

    @model_validator(mode="after")
    def _validate_profiler_config(self) -> Self:
        """Cross-check the profiler fields and normalise the trace directory.

        Warns when the 'torch' profiler is combined with a delay or iteration
        limit while front-end profiling is still on. A non-URI trace directory
        is rewritten to an absolute path with `~` expanded.

        Raises:
            ValueError: If `torch_profiler_dir` is set while the profiler is
                not 'torch', or if the profiler is 'torch' and no directory
                is set.

        Returns:
            This instance.
        """
        using_torch = self.profiler == "torch"
        trace_dir = self.torch_profiler_dir

        if (
            using_torch
            and (self.delay_iterations > 0 or self.max_iterations > 0)
            and not self.ignore_frontend
        ):
            logger.warning_once(
                "Using 'torch' profiler with delay_iterations or max_iterations "
                "while ignore_frontend is False may result in high overhead."
            )

        if trace_dir and not using_torch:
            raise ValueError(
                "torch_profiler_dir is only applicable when profiler is set to 'torch'"
            )
        if using_torch and not trace_dir:
            raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

        # Paths with a URI scheme (gs://, s3://, hdfs://, etc.) are kept
        # as given; only other paths are made absolute.
        if trace_dir and not _is_uri_path(trace_dir):
            expanded = os.path.expanduser(trace_dir)
            self.torch_profiler_dir = os.path.abspath(expanded)

        return self

active_iterations = Field(default=5, ge=1) class-attribute instance-attribute

Number of active iterations for PyTorch profiler schedule. This is the number of iterations where profiling data is actually collected. Defaults to 5 active iterations.

delay_iterations = Field(default=0, ge=0) class-attribute instance-attribute

Number of engine iterations to skip before starting profiling. Defaults to 0, meaning profiling starts immediately after receiving /start_profile.

ignore_frontend = False class-attribute instance-attribute

If True, disables the front-end profiling of AsyncLLM when using the 'torch' profiler. This is needed to reduce overhead when using delay/limit options, since the front-end profiling does not track iterations and will capture the entire range.

max_iterations = Field(default=0, ge=0) class-attribute instance-attribute

Maximum number of engine iterations to profile after starting profiling. Defaults to 0, meaning no limit.

profiler = None class-attribute instance-attribute

Which profiler to use. Defaults to None. Options are:

  • 'torch': Use PyTorch profiler.
  • 'cuda': Use CUDA profiler.

torch_profiler_dir = '' class-attribute instance-attribute

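Directory to save torch profiler traces. Both AsyncLLM's CPU traces and worker's traces (CPU & GPU) will be saved under this directory. Note that local paths are normalized to absolute paths (with ~ expanded) during validation, while URI paths (e.g. gs://, s3://, hdfs://) are left unchanged.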

torch_profiler_dump_cuda_time_total = True class-attribute instance-attribute

If True, dumps total CUDA time in torch profiler traces. Enabled by default.

torch_profiler_record_shapes = False class-attribute instance-attribute

If True, records tensor shapes in the torch profiler. Disabled by default.

torch_profiler_use_gzip = True class-attribute instance-attribute

If True, saves torch profiler traces in gzip format. Enabled by default.

torch_profiler_with_flops = False class-attribute instance-attribute

If True, enables FLOPS counting in the torch profiler. Disabled by default.

torch_profiler_with_memory = False class-attribute instance-attribute

If True, enables memory profiling in the torch profiler. Disabled by default.

torch_profiler_with_stack = True class-attribute instance-attribute

If True, enables stack tracing in the torch profiler. Enabled by default as it is useful for debugging. Can be disabled via --profiler-config.torch_profiler_with_stack=false CLI flag.

wait_iterations = Field(default=0, ge=0) class-attribute instance-attribute

Number of wait iterations for PyTorch profiler schedule. During wait, the profiler is completely off with zero overhead. This allows skipping initial iterations before warmup begins. Defaults to 0 (no wait period).

warmup_iterations = Field(default=0, ge=0) class-attribute instance-attribute

Number of warmup iterations for PyTorch profiler schedule. During warmup, the profiler runs but data is discarded. This helps reduce noise from JIT compilation and other one-time costs in the profiled trace. Defaults to 0 (schedule-based profiling disabled, recording all iterations). Set to a positive value (e.g., 2) to enable schedule-based profiling.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/profiler.py
def compute_hash(self) -> str:
    """
    Return a hash of the settings that shape the computation graph.

    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    The hash must uniquely identify every config value that affects the
    structure of the computation graph from input ids/embeddings to the
    final hidden states; anything before the input ids/embeddings or
    after the final hidden states is out of scope.
    """
    # This config does not affect the computation graph, so the factor
    # list is intentionally left empty.
    graph_factors: list[Any] = []
    encoded_factors = str(graph_factors).encode()
    return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

ReasoningConfig

Configuration for reasoning models.

Set reasoning_start_str and reasoning_end_str to the strings that delimit the reasoning block (e.g. "<think>" and "</think>"). The corresponding token IDs are derived automatically via initialize_token_ids and are not intended to be set directly.

Methods:

Attributes:

Source code in vllm/config/reasoning.py
@config
class ReasoningConfig:
    """Configuration for reasoning models.

    Set `reasoning_start_str` and `reasoning_end_str` to the strings that delimit
    the reasoning block (e.g. `"<think>"` and `"</think>"`).  The
    corresponding token IDs are derived automatically via
    `initialize_token_ids` and are not intended to be set directly.
    """

    reasoning_parser: str = ""
    """The name of the ReasoningParser to use for this model."""
    reasoning_start_str: str = ""
    """String that indicates the start of reasoning."""
    reasoning_end_str: str = ""
    """String that indicates the end of reasoning content."""

    _reasoning_start_token_ids: list[int] | None = field(
        default=None, init=False, repr=False
    )
    """Private backing field for `reasoning_start_token_ids`. Set by
    `initialize_token_ids`. Not intended to be configured directly."""
    _reasoning_end_token_ids: list[int] | None = field(
        default=None, init=False, repr=False
    )
    """Private backing field for `reasoning_end_token_ids`. Set by
    `initialize_token_ids`. Not intended to be configured directly."""

    _enabled: bool = field(default=False, init=False, repr=False)
    """Private field indicating whether reasoning token IDs have been initialized.
    Set to True by `initialize_token_ids` once token IDs are initialized."""

    @property
    def enabled(self) -> bool:
        """Returns True if reasoning is enabled (i.e. if token IDs have been
        initialized), False otherwise."""
        return self._enabled

    @property
    def reasoning_start_token_ids(self) -> list[int] | None:
        """Token IDs derived from `reasoning_start_str`. Set automatically by
        `initialize_token_ids`. Not intended to be configured directly."""
        return self._reasoning_start_token_ids

    @property
    def reasoning_end_token_ids(self) -> list[int] | None:
        """Token IDs derived from `reasoning_end_str`. Set automatically by
        `initialize_token_ids`. Not intended to be configured directly."""
        return self._reasoning_end_token_ids

    def initialize_token_ids(self, model_config: ModelConfig) -> None:
        """Initialize reasoning token IDs from strings using the tokenizer.

        The delimiter strings are taken from `reasoning_start_str` and
        `reasoning_end_str`. Whichever of them is empty is filled in from the
        reasoning parser named by `reasoning_parser`, when a name is set. If
        either string is still empty afterwards, nothing is initialized and
        the config stays disabled.

        Args:
            model_config: Model configuration used to obtain the tokenizer.

        Raises:
            ValueError: If either delimiter string tokenizes to no tokens.
        """
        if (
            self._reasoning_start_token_ids is not None
            and self._reasoning_end_token_ids is not None
        ):
            self._enabled = True
            return  # Already initialized

        tokenizer = cached_tokenizer_from_config(model_config=model_config)
        reasoning_start_str = self.reasoning_start_str
        reasoning_end_str = self.reasoning_end_str
        # `reasoning_parser` is a str defaulting to "", so an `is not None`
        # test is always true; check truthiness so the parser registry is
        # only consulted when a parser name was actually configured.
        if self.reasoning_parser and (
            not reasoning_start_str or not reasoning_end_str
        ):
            parser_cls = ReasoningParserManager.get_reasoning_parser(
                self.reasoning_parser
            )
            reasoning_parser = parser_cls(tokenizer)
            start_token = reasoning_parser.reasoning_start_str
            if start_token and not reasoning_start_str:
                reasoning_start_str = start_token

            end_token = reasoning_parser.reasoning_end_str
            if end_token and not reasoning_end_str:
                reasoning_end_str = end_token

        if not reasoning_start_str or not reasoning_end_str:
            # If we don't have valid strings to tokenize,
            # we can't initialize the token IDs.
            return

        start_token_ids = tokenizer.encode(
            reasoning_start_str, add_special_tokens=False
        )
        end_token_ids = tokenizer.encode(reasoning_end_str, add_special_tokens=False)

        if not start_token_ids or not end_token_ids:
            # Report the strings that were actually tokenized (they may have
            # come from the parser rather than from this config's fields).
            raise ValueError(
                f"ReasoningConfig: failed to tokenize reasoning strings: "
                f"reasoning_start_str='{reasoning_start_str}', "
                f"reasoning_end_str='{reasoning_end_str}'. "
                "Ensure the strings are valid tokens in the model's vocabulary."
            )

        # Commit only after both encodings are known to be non-empty, so a
        # failed attempt never leaves empty ID lists behind that a later call
        # would mistake for a completed initialization.
        self._reasoning_start_token_ids = start_token_ids
        self._reasoning_end_token_ids = end_token_ids
        self._enabled = True

_enabled = field(default=False, init=False, repr=False) class-attribute instance-attribute

Private field indicating whether reasoning token IDs have been initialized. Set to True by initialize_token_ids once token IDs are initialized.

_reasoning_end_token_ids = field(default=None, init=False, repr=False) class-attribute instance-attribute

Private backing field for reasoning_end_token_ids. Set by initialize_token_ids. Not intended to be configured directly.

_reasoning_start_token_ids = field(default=None, init=False, repr=False) class-attribute instance-attribute

Private backing field for reasoning_start_token_ids. Set by initialize_token_ids. Not intended to be configured directly.

enabled property

Returns True if reasoning is enabled (i.e. if token IDs have been initialized), False otherwise.

reasoning_end_str = '' class-attribute instance-attribute

String that indicates the end of reasoning content.

reasoning_end_token_ids property

Token IDs derived from reasoning_end_str. Set automatically by initialize_token_ids. Not intended to be configured directly.

reasoning_parser = '' class-attribute instance-attribute

The name of the ReasoningParser to use for this model.

reasoning_start_str = '' class-attribute instance-attribute

String that indicates the start of reasoning.

reasoning_start_token_ids property

Token IDs derived from reasoning_start_str. Set automatically by initialize_token_ids. Not intended to be configured directly.

initialize_token_ids(model_config)

Initialize reasoning token IDs from strings using the tokenizer.

Source code in vllm/config/reasoning.py
def initialize_token_ids(self, model_config: ModelConfig) -> None:
    """Initialize reasoning token IDs from strings using the tokenizer.

    The delimiter strings are taken from `reasoning_start_str` and
    `reasoning_end_str`. Whichever of them is empty is filled in from the
    reasoning parser named by `reasoning_parser`, when a name is set. If
    either string is still empty afterwards, nothing is initialized and
    the config stays disabled.

    Args:
        model_config: Model configuration used to obtain the tokenizer.

    Raises:
        ValueError: If either delimiter string tokenizes to no tokens.
    """
    if (
        self._reasoning_start_token_ids is not None
        and self._reasoning_end_token_ids is not None
    ):
        self._enabled = True
        return  # Already initialized

    tokenizer = cached_tokenizer_from_config(model_config=model_config)
    reasoning_start_str = self.reasoning_start_str
    reasoning_end_str = self.reasoning_end_str
    # `reasoning_parser` is a str defaulting to "", so an `is not None`
    # test is always true; check truthiness so the parser registry is
    # only consulted when a parser name was actually configured.
    if self.reasoning_parser and (
        not reasoning_start_str or not reasoning_end_str
    ):
        parser_cls = ReasoningParserManager.get_reasoning_parser(
            self.reasoning_parser
        )
        reasoning_parser = parser_cls(tokenizer)
        start_token = reasoning_parser.reasoning_start_str
        if start_token and not reasoning_start_str:
            reasoning_start_str = start_token

        end_token = reasoning_parser.reasoning_end_str
        if end_token and not reasoning_end_str:
            reasoning_end_str = end_token

    if not reasoning_start_str or not reasoning_end_str:
        # If we don't have valid strings to tokenize,
        # we can't initialize the token IDs.
        return

    start_token_ids = tokenizer.encode(reasoning_start_str, add_special_tokens=False)
    end_token_ids = tokenizer.encode(reasoning_end_str, add_special_tokens=False)

    if not start_token_ids or not end_token_ids:
        # Report the strings that were actually tokenized (they may have
        # come from the parser rather than from this config's fields).
        raise ValueError(
            f"ReasoningConfig: failed to tokenize reasoning strings: "
            f"reasoning_start_str='{reasoning_start_str}', "
            f"reasoning_end_str='{reasoning_end_str}'. "
            "Ensure the strings are valid tokens in the model's vocabulary."
        )

    # Commit only after both encodings are known to be non-empty, so a
    # failed attempt never leaves empty ID lists behind that a later call
    # would mistake for a completed initialization.
    self._reasoning_start_token_ids = start_token_ids
    self._reasoning_end_token_ids = end_token_ids
    self._enabled = True

SchedulerConfig

Scheduler configuration.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

  • default_factory

    Factory method to create SchedulerConfig with default values for InitVars.

Attributes:

Source code in vllm/config/scheduler.py
@config
class SchedulerConfig:
    """Scheduler configuration."""

    max_model_len: InitVar[int]
    """Maximum length of a sequence (including prompt and generated text).

    Note: This is stored in the ModelConfig, and is used only here to
    provide fallbacks and validate other attributes."""

    is_encoder_decoder: InitVar[bool]
    """True if the model is an encoder-decoder model.

    Note: This is stored in the ModelConfig, and is used only here to
    disable chunked prefill and prefix caching for encoder-decoder models.
    """

    DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
    DEFAULT_MAX_NUM_BATCHED_TOKENS_FOR_BATCHED_DP: ClassVar[int] = 256
    DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

    runner_type: RunnerType = "generate"
    """The runner type to launch for the model."""

    max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1)
    """Maximum number of tokens that can be processed in a single iteration.

    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    max_num_scheduled_tokens: int | None = Field(default=None, ge=0)
    """Maximum number of tokens that the scheduler may issue in a single iteration.

    This is usually equal to max_num_batched_tokens, but can be smaller in cases
    when the model might append tokens into the batch (such as speculative decoding).
    Defaults to max_num_batched_tokens."""

    max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1)
    """Maximum number of sequences to be processed in a single iteration.

    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    max_num_partial_prefills: int = Field(default=1, ge=1)
    """For chunked prefill, the maximum number of sequences that can be
    partially prefilled concurrently."""

    max_long_partial_prefills: int = Field(default=1, ge=1)
    """For chunked prefill, the maximum number of prompts longer than
    long_prefill_token_threshold that will be prefilled concurrently. Setting
    this less than max_num_partial_prefills will allow shorter prompts to jump
    the queue in front of longer prompts in some cases, improving latency."""

    long_prefill_token_threshold: int = Field(default=0, ge=0)
    """For chunked prefill, a request is considered long if the prompt is
    longer than this number of tokens."""

    enable_chunked_prefill: bool = True
    """If True, prefill requests can be chunked based
    on the remaining `max_num_batched_tokens`.

    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    is_multimodal_model: bool = False
    """True if the model is multimodal."""

    # TODO (ywang96): Make this configurable.
    max_num_encoder_input_tokens: int = Field(init=False)
    """Multimodal encoder compute budget, only used in V1.

    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    # TODO (ywang96): Make this configurable.
    encoder_cache_size: int = Field(init=False)
    """Multimodal encoder cache size, only used in V1.

    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    policy: SchedulerPolicy = "fcfs"
    """The scheduling policy to use:

    - "fcfs" means first come first served, i.e. requests are handled in order 
      of arrival.
    - "priority" means requests are handled based on given priority (lower
      value means earlier handling) and time of arrival deciding any ties)."""

    disable_chunked_mm_input: bool = False
    """If set to true and chunked prefill is enabled, we do not want to
    partially schedule a multimodal item. Only used in V1
    This ensures that if a request has a mixed prompt
    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""

    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
    # (default) or "mod.custom_class".
    scheduler_cls: str | type[object] | None = None
    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
    the default scheduler. Can be a class directly or the path to a class of
    form "mod.custom_class"."""

    disable_hybrid_kv_cache_manager: bool | None = None
    """If set to True, KV cache manager will allocate the same size of KV cache
    for all attention layers even if there are multiple type of attention layers
    like full attention and sliding window attention.
    If set to None, the default value will be determined based on the environment
    and starting configuration.
    """

    scheduler_reserve_full_isl: bool = True
    """If True, the scheduler checks whether the full input sequence length
    fits in the KV cache before admitting a new request, rather than only
    checking the first chunk. Prevents over-admission and KV cache thrashing
    with chunked prefill."""

    watermark: float = Field(default=0.0, ge=0.0, lt=1.0)
    """Fraction of total KV cache blocks to keep free (the watermark) when
    admitting waiting or preempted requests into the running queue. This headroom
    helps avoid frequent KV cache eviction and the resulting repeated preemption
    of requests when GPU memory is scarce. Must be in the range [0.0, 1.0); 0.0
    (the default) disables the watermark."""

    prefill_schedule_interval: int = Field(default=1, ge=1)
    """For data-parallel deployments, only admit new prefill requests
    once every N engine steps, aligned across DP ranks, to better balance
    per-step forward-pass times."""

    async_scheduling: bool | None = None
    """If set to False, disable async scheduling. Async scheduling helps to
    avoid gaps in GPU utilization, leading to better latency and throughput.
    """

    stream_interval: int = Field(default=1, ge=1)
    """The interval (or buffer size) for streaming in terms of token length.
    A smaller value (1) makes streaming smoother by sending each token immediately,
    while a larger value (e.g., 10) reduces host overhead and may increase throughput
    by batching multiple tokens before sending."""

    @staticmethod
    def default_factory(**kwargs):
        """
        Build a `SchedulerConfig`, filling in defaults for its `InitVar`s.

        `max_model_len` falls back to 8192 and `is_encoder_decoder` to False
        when the caller omits them; every other keyword argument is forwarded
        to the constructor as given.
        """
        initvar_fallbacks = {"max_model_len": 8192, "is_encoder_decoder": False}
        for initvar_name, fallback in initvar_fallbacks.items():
            kwargs.setdefault(initvar_name, fallback)
        return SchedulerConfig(**kwargs)

    def get_scheduler_cls(self) -> type["SchedulerInterface"]:
        """Return the scheduler class selected by this config.

        With no `scheduler_cls` set, this is `AsyncScheduler` when
        `async_scheduling` is truthy and `Scheduler` otherwise. A custom
        `scheduler_cls` triggers a one-time warning and is returned as-is
        when it is a class, or resolved from its qualified name when it is
        a string.
        """
        custom_cls = self.scheduler_cls
        if custom_cls is not None:
            # The first half of this warning can be removed once the Scheduler
            # interface is finalized and we can maintain support for scheduler
            # classes that implement it
            logger.warning_once(
                "Using custom scheduler class %s. This scheduler interface is not public "
                "and compatibility may not be maintained. If you have subclassed Scheduler "
                "instead of AsyncScheduler, you will see degraded performance due to async "
                "scheduling being disabled.",
                custom_cls,  # type: ignore[arg-type]
            )
            if isinstance(custom_cls, str):
                return resolve_obj_by_qualname(custom_cls)
            return cast(type["SchedulerInterface"], custom_cls)

        # Built-in schedulers are imported lazily, only for the branch taken.
        if self.async_scheduling:
            from vllm.v1.core.sched.async_scheduler import AsyncScheduler

            return AsyncScheduler

        from vllm.v1.core.sched.scheduler import Scheduler

        return Scheduler

    def compute_hash(self) -> str:
        """
        Return a hash of the settings that shape the computation graph.

        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        The hash must uniquely identify every config value that affects the
        structure of the computation graph from input ids/embeddings to the
        final hidden states; anything before the input ids/embeddings or
        after the final hidden states is out of scope.
        """
        # max_num_batched_tokens need to be included in the hash due
        # to two reasons:
        # 1. LoRA creates static buffers based on max_num_batched_tokens.
        #   The tensor sizes and strides get captured in the torch.compile
        #   graph explicitly.
        # 2. Inductor decides whether using 32-bit or 64-bit indexing integer
        #   based on the data sizes. `max_num_batched_tokens` has an
        #   impact on that. For more details, please check
        #   https://github.com/vllm-project/vllm/issues/29585
        graph_factors: list[Any] = [self.max_num_batched_tokens]

        encoded_factors = str(graph_factors).encode()
        return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

    @field_validator("scheduler_cls", "async_scheduling", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Let `None` bypass validation (used when initialisation is delayed);
        any other value is passed to the wrapped validation handler."""
        if value is None:
            return None
        return handler(value)

    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
        """Fill in derived fields, log the effective setup, then validate.

        Args:
            max_model_len: Maximum sequence length; used to derive the default
                `long_prefill_token_threshold` and for validation.
            is_encoder_decoder: Whether the model is encoder-decoder, in which
                case chunked prefill is switched off.

        Raises:
            ValueError: Propagated from `verify_max_model_len`.
        """
        if is_encoder_decoder:
            # Chunked prefill should be disabled for encoder-decoder models.
            self.enable_chunked_prefill = False
            self.disable_chunked_mm_input = True
            self.long_prefill_token_threshold = 0
            logger.info(
                "Encoder-decoder models do not support chunked prefill nor"
                " prefix caching; disabling both."
            )

        # Both encoder budgets start out equal to the batched-token budget.
        token_budget = self.max_num_batched_tokens
        self.max_num_encoder_input_tokens = token_budget
        self.encoder_cache_size = token_budget

        if self.enable_chunked_prefill:
            logger.info_once(
                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                token_budget,
            )

        concurrent_partial_prefills = self.max_num_partial_prefills > 1
        if concurrent_partial_prefills and self.long_prefill_token_threshold == 0:
            # No explicit threshold: default to 4% of the max model length.
            self.long_prefill_token_threshold = int(max_model_len * 0.04)

        if concurrent_partial_prefills:
            logger.info(
                "Concurrent partial prefills enabled with "
                "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
                "long_prefill_token_threshold=%d",
                self.max_num_partial_prefills,
                self.max_long_partial_prefills,
                self.long_prefill_token_threshold,
            )

        self.verify_max_model_len(max_model_len)

    def verify_max_model_len(self, max_model_len: int) -> Self:
        """Check the batching and partial-prefill limits for consistency.

        Args:
            max_model_len: Maximum sequence length to validate against.

        Returns:
            This config instance.

        Raises:
            ValueError: If `max_num_batched_tokens` is smaller than
                `max_model_len` while chunked prefill is disabled, or smaller
                than `max_num_seqs`; if `max_num_partial_prefills > 1`
                without chunked prefill or with a
                `long_prefill_token_threshold` above `max_model_len`; or if
                `max_long_partial_prefills` exceeds
                `max_num_partial_prefills`.
        """
        batched_tokens = self.max_num_batched_tokens
        max_seqs = self.max_num_seqs

        if not self.enable_chunked_prefill and batched_tokens < max_model_len:
            raise ValueError(
                f"max_num_batched_tokens ({batched_tokens}) is "
                f"smaller than max_model_len ({max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len."
            )

        if batched_tokens < max_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({max_seqs})."
            )

        # A budget above this product is only warned about, not rejected.
        seq_token_capacity = max_seqs * max_model_len
        if batched_tokens > seq_token_capacity:
            logger.warning(
                "max_num_batched_tokens (%d) exceeds max_num_seqs "
                "* max_model_len (%d). This may lead to unexpected behavior.",
                batched_tokens,
                seq_token_capacity,
            )

        partial_prefills = self.max_num_partial_prefills
        if partial_prefills > 1:
            if not self.enable_chunked_prefill:
                raise ValueError(
                    "Chunked prefill must be enabled to set "
                    "max_num_partial_prefills > 1."
                )

            long_threshold = self.long_prefill_token_threshold
            if long_threshold > max_model_len:
                raise ValueError(
                    "long_prefill_token_threshold "
                    f"({long_threshold}) cannot be greater "
                    f"than the max_model_len ({max_model_len})."
                )

        if self.max_long_partial_prefills > partial_prefills:
            raise ValueError(
                f"{self.max_long_partial_prefills=} must be less than or equal to "
                f"{self.max_num_partial_prefills=}."
            )

        return self

async_scheduling = None class-attribute instance-attribute

If set to False, disable async scheduling. Async scheduling helps to avoid gaps in GPU utilization, leading to better latency and throughput.

disable_chunked_mm_input = False class-attribute instance-attribute

If set to true and chunked prefill is enabled, we do not want to partially schedule a multimodal item. Only used in V1. This ensures that if a request has a mixed prompt (like text tokens TTTT followed by image tokens IIIIIIIIII) where only some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), it will be scheduled as TTTT in one step and IIIIIIIIII in the next.

disable_hybrid_kv_cache_manager = None class-attribute instance-attribute

If set to True, KV cache manager will allocate the same size of KV cache for all attention layers even if there are multiple types of attention layers like full attention and sliding window attention. If set to None, the default value will be determined based on the environment and starting configuration.

enable_chunked_prefill = True class-attribute instance-attribute

If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.

The default value here is mainly for convenience when testing. In real usage, this should be set in EngineArgs.create_engine_config.

encoder_cache_size = Field(init=False) class-attribute instance-attribute

Multimodal encoder cache size, only used in V1.

NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.

is_multimodal_model = False class-attribute instance-attribute

True if the model is multimodal.

long_prefill_token_threshold = Field(default=0, ge=0) class-attribute instance-attribute

For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.

max_long_partial_prefills = Field(default=1, ge=1) class-attribute instance-attribute

For chunked prefill, the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. Setting this less than max_num_partial_prefills will allow shorter prompts to jump the queue in front of longer prompts in some cases, improving latency.

max_num_batched_tokens = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1) class-attribute instance-attribute

Maximum number of tokens that can be processed in a single iteration.

The default value here is mainly for convenience when testing. In real usage, this should be set in EngineArgs.create_engine_config.

max_num_encoder_input_tokens = Field(init=False) class-attribute instance-attribute

Multimodal encoder compute budget, only used in V1.

NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.

max_num_partial_prefills = Field(default=1, ge=1) class-attribute instance-attribute

For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.

max_num_scheduled_tokens = Field(default=None, ge=0) class-attribute instance-attribute

Maximum number of tokens that the scheduler may issue in a single iteration.

This is usually equal to max_num_batched_tokens, but can be smaller in cases when the model might append tokens into the batch (such as speculative decoding). Defaults to max_num_batched_tokens.

max_num_seqs = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1) class-attribute instance-attribute

Maximum number of sequences to be processed in a single iteration.

The default value here is mainly for convenience when testing. In real usage, this should be set in EngineArgs.create_engine_config.

policy = 'fcfs' class-attribute instance-attribute

The scheduling policy to use:

  • "fcfs" means first come first served, i.e. requests are handled in order of arrival.
  • "priority" means requests are handled based on given priority (lower value means earlier handling), with time of arrival deciding any ties.

prefill_schedule_interval = Field(default=1, ge=1) class-attribute instance-attribute

For data-parallel deployments, only admit new prefill requests once every N engine steps, aligned across DP ranks, to better balance per-step forward-pass times.

runner_type = 'generate' class-attribute instance-attribute

The runner type to launch for the model.

scheduler_cls = None class-attribute instance-attribute

The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is the default scheduler. Can be a class directly or the path to a class of form "mod.custom_class".

scheduler_reserve_full_isl = True class-attribute instance-attribute

If True, the scheduler checks whether the full input sequence length fits in the KV cache before admitting a new request, rather than only checking the first chunk. Prevents over-admission and KV cache thrashing with chunked prefill.

stream_interval = Field(default=1, ge=1) class-attribute instance-attribute

The interval (or buffer size) for streaming in terms of token length. A smaller value (1) makes streaming smoother by sending each token immediately, while a larger value (e.g., 10) reduces host overhead and may increase throughput by batching multiple tokens before sending.

watermark = Field(default=0.0, ge=0.0, lt=1.0) class-attribute instance-attribute

Fraction of total KV cache blocks to keep free (the watermark) when admitting waiting or preempted requests into the running queue. This headroom helps avoid frequent KV cache eviction and the resulting repeated preemption of requests when GPU memory is scarce. Must be in the range [0.0, 1.0); 0.0 (the default) disables the watermark.

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialisation is delayed.

Source code in vllm/config/scheduler.py
@field_validator("scheduler_cls", "async_scheduling", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Skip validation if the value is `None` when initialisation is delayed."""
    return None if value is None else handler(value)

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/scheduler.py
def compute_hash(self) -> str:
    """
    Return a hex digest over the settings of this config that shape the
    computation graph (from input ids/embeddings up to the final hidden
    states; anything before or after that range is out of scope).

    WARNING: when adding a field to this config, add it to the factors
    below if it influences the computation graph.
    """
    # `max_num_batched_tokens` is part of the hash for two reasons:
    # 1. LoRA allocates static buffers sized by it, and the resulting
    #    tensor sizes and strides are captured explicitly in the
    #    torch.compile graph.
    # 2. Inductor picks 32-bit or 64-bit index integers from the data
    #    sizes, which this value influences. See
    #    https://github.com/vllm-project/vllm/issues/29585
    factors: list[Any] = [self.max_num_batched_tokens]

    digest = safe_hash(str(factors).encode(), usedforsecurity=False)
    return digest.hexdigest()

default_factory(**kwargs) staticmethod

Factory method to create SchedulerConfig with default values for InitVars.

Source code in vllm/config/scheduler.py
@staticmethod
def default_factory(**kwargs):
    """
    Build a `SchedulerConfig`, supplying defaults for the `InitVar`s that
    the caller did not pass (`max_model_len` and `is_encoder_decoder`).
    """
    # Only fill in values the caller left out; explicit arguments win.
    kwargs.setdefault("max_model_len", 8192)
    kwargs.setdefault("is_encoder_decoder", False)
    return SchedulerConfig(**kwargs)

SpeculativeConfig

Configuration for speculative decoding.

Methods:

Attributes:

Source code in vllm/config/speculative.py
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
@config
class SpeculativeConfig:
    """Configuration for speculative decoding."""

    enforce_eager: bool | None = None
    """Override the default enforce_eager from model_config"""
    # General speculative decoding control
    # Annotated as `int` but defaults to None (hence the type-ignore);
    # `__post_init__` tests `num_speculative_tokens is not None`.
    num_speculative_tokens: int = Field(default=None, gt=0)  # type: ignore[assignment]
    """The number of speculative tokens, if provided. It will default to the
    number in the draft model config if present, otherwise, it is required."""
    model: str | None = None
    """The name of the draft model, eagle head, or additional weights, if
    provided."""
    method: SpeculativeMethod | None = None
    """The name of the speculative method to use. If users provide and set the
    `model` param, the speculative method type will be detected automatically
    if possible, if `model` param is not provided, the method name must be
    provided.

    If using `ngram` method, the related configuration `prompt_lookup_max` and
    `prompt_lookup_min` should be considered."""
    draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
    """The degree of the tensor parallelism for the draft model. Can only be 1
    or the same as the target model's tensor parallel size."""
    tensor_parallel_size: int | None = None
    """Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
    warn users when they mistakenly provide the wrong argument."""

    # Draft model configuration
    quantization: me_quant.QuantizationMethods | str | None = None
    """Quantization method that was used to quantize the draft model weights.
    If `None`, we assume the model weights are not quantized. Note that it only
    takes effect when using the draft model-based speculative method."""
    moe_backend: MoEBackend | None = None
    """MoE backend to use for the draft model. When `None`, the draft model
    inherits the target model's `--moe-backend` setting. Useful when the
    drafter and generator require different MoE kernels (e.g. quantized
    generator with unquantized drafter)."""
    attention_backend: AttentionBackendEnum | None = None
    """Attention backend to use for the draft model. When `None`, the backend is
    automatically selected. Useful when the drafter requires a different attention
    backend (e.g. DFlash needs a non-causal-capable backend like FLASH_ATTN)."""
    max_model_len: int | None = Field(default=None, ge=1)
    """The maximum model length of the draft model. Used when testing the
    ability to skip speculation for some sequences."""
    revision: str | None = None
    """The specific model version to use for the draft model. It can be a
    branch name, a tag name, or a commit id. If unspecified, will use the
    default version."""
    code_revision: str | None = None
    """The specific revision to use for the draft model code on Hugging Face
    Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
    will use the default version."""

    # Advanced control
    disable_padded_drafter_batch: bool = False
    """Disable input padding for speculative decoding. If set to True,
    speculative input batches can contain sequences of different lengths,
    which may only be supported by certain attention backends. This currently
    only affects the EAGLE method of speculation."""
    use_local_argmax_reduction: bool = False
    """Use vocab-parallel local argmax instead of all-gathering full logits
    for draft token generation. Reduces communication from O(vocab_size) to
    O(2 * tp_size) per token. Only applies to greedy draft selection in
    non-tree speculation."""

    # Ngram proposer configuration
    prompt_lookup_max: int | None = Field(default=None, ge=1)
    """Maximum size of ngram token window when using Ngram proposer, required
    when method is set to ngram."""
    prompt_lookup_min: int | None = Field(default=None, ge=1)
    """Minimum size of ngram token window when using Ngram proposer, if
    provided. Defaults to 1."""

    # Alternative drafting strategies
    parallel_drafting: bool = False
    """Enable parallel drafting, where all speculative tokens are generated
    in parallel rather than sequentially. This can improve performance but
    requires the speculative model be trained to support parallel drafting.
    Only compatible with EAGLE and draft model methods."""

    # required configuration params passed from engine
    # Both default to None despite the non-optional annotation (validation is
    # skipped); `__post_init__` raises for method "mtp" when
    # `target_model_config` is still None.
    target_model_config: SkipValidation[ModelConfig] = None  # type: ignore
    """The configuration of the target model."""
    target_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
    """The parallel configuration for the target model."""

    # dynamic speculative decoding control
    num_speculative_tokens_per_batch_size: list[tuple[int, int, int]] | None = None
    """Batch-size schedule used to dynamically choose speculative-token count.

    Each entry is ``(range_start, range_end, num_speculative_tokens)`` with an
    inclusive batch-size range.
    """

    # params generated in the post-init stage
    # Like the target configs, these default to None until populated.
    draft_model_config: SkipValidation[ModelConfig] = None  # type: ignore
    """The configuration of the draft model initialized internal."""
    draft_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
    """The parallel configuration for the draft model initialized internal."""

    # Suffix decoding configuration
    suffix_decoding_max_tree_depth: int = 24
    """The maximum depth of the suffix decoding global and prompt trees. The
    tree depth limits the sum of the prefix match and speculation lengths."""

    suffix_decoding_max_cached_requests: int = 10000
    """The maximum number of requests to cache in the global suffix tree. If
    exceeded, will trigger eviction in FIFO order. If set to 0, the global
    suffix tree is disabled and past responses are not cached (prompt trees
    are still used)."""

    suffix_decoding_max_spec_factor: float = 1.0
    """The maximum spec factor for suffix decoding. The spec factor controls
    speculation lengths based on the prefix match length: max_spec_tokens =
    max_spec_factor * prefix_match_length."""

    suffix_decoding_min_token_prob: float = 0.1
    """The minimum token probability for suffix decoding. Will only speculate
    tokens with estimated probability (based on frequency counts) greater than
    or equal to this value."""

    draft_load_config: LoadConfig | None = None
    """Load config for the draft model. If not specified, will use the load
    config from the target model."""

    # NOTE(review): the docstring below mentions `synthetic_acceptance_rate`,
    # but the fields declared in this class are `synthetic_acceptance_rates`
    # and `synthetic_acceptance_length`.
    rejection_sample_method: RejectionSampleMethod = "standard"
    """The rejection sampling method to use. 'standard' uses probabilistic
    rejection sampling (with or without cached draft logits, controlled by
    draft_sample_method). 'synthetic' accepts draft tokens with a decaying
    probability calibrated to synthetic_acceptance_rate."""

    synthetic_acceptance_rates: list[float] | None = None
    """Per-position *unconditional* acceptance rates for synthetic rejection
    sampling. Position i's entry is the marginal probability that the first
    i+1 draft tokens are all accepted; the list must have length
    num_speculative_tokens, each entry in [0, 1], and be monotonically
    non-increasing. Only valid when rejection_sample_method is 'synthetic'.
    Mutually exclusive with synthetic_acceptance_length."""

    synthetic_acceptance_length: float | None = None
    """Target mean acceptance length for synthetic rejection sampling, in
    [1, num_speculative_tokens + 1]. Resolved internally to
    synthetic_acceptance_rates. Only valid when rejection_sample_method is 'synthetic'.
    Mutually exclusive with synthetic_acceptance_rates."""

    @staticmethod
    def _acceptance_length_to_rates(length: float, n: int) -> list[float]:
        """Mean acceptance length to unconditional per-position rates, using
        the minimum-variance schedule."""
        num_drafts = length - 1  # expected number of accepted draft tokens
        num_full = int(num_drafts)
        return (
            [1.0] * num_full + [num_drafts - num_full] + [0.0] * (n - num_full - 1)
        )[:n]

    @staticmethod
    def _resolve_synthetic_acceptance_rates(
        n: int,
        rates: list[float] | None,
        length: float | None,
    ) -> list[float]:
        """Return per-position unconditional acceptance rates from exactly one
        of `rates` or `length` (validates range, length, and monotonicity)."""
        if (rates is None) == (length is None):
            raise ValueError(
                "rejection_sample_method='synthetic' requires exactly one of "
                "synthetic_acceptance_rates or synthetic_acceptance_length."
            )
        if rates is not None:
            if len(rates) != n:
                raise ValueError(
                    f"synthetic_acceptance_rates must have length {n}, got {rates}."
                )
            if not all(0.0 <= r <= 1.0 for r in rates):
                raise ValueError(
                    f"synthetic_acceptance_rates entries must be in [0, 1], "
                    f"got {rates}."
                )
            if any(rates[i] > rates[i - 1] for i in range(1, n)):
                raise ValueError(
                    f"synthetic_acceptance_rates must be non-increasing, got {rates}."
                )
            return list(rates)
        assert length is not None
        if not 1.0 <= length <= float(n + 1):
            raise ValueError(
                f"synthetic_acceptance_length must be in [1, {n + 1}], got {length}."
            )
        return SpeculativeConfig._acceptance_length_to_rates(length, n)

    # Config field selecting how the drafter samples; it is declared after the
    # static helper methods rather than with the other fields.
    draft_sample_method: DraftSampleMethod = "greedy"
    """How the draft model samples tokens. 'greedy' always picks the argmax
    token, and the draft probabilities are treated as one-hot during rejection
    sampling. 'probabilistic' samples stochastically from the draft
    distribution and uses the full draft logits for the probability ratio test
    during rejection sampling. This comes at the cost of additional GPU memory
    usage."""

    def compute_hash(self) -> str:
        """
        Return a hex digest over the settings of this config that shape the
        computation graph (from input ids/embeddings up to the final hidden
        states; anything before or after that range is out of scope).

        WARNING: when adding a field to this config, add it to the factors
        below if it influences the computation graph.
        """
        # These methods change the graph: the target model returns
        # intermediate (auxiliary) hidden states alongside the final one.
        aux_hidden_state_methods = ("eagle3", "extract_hidden_states", "dflash")
        uses_aux_hidden_states = self.method in aux_hidden_state_methods
        factors: list[Any] = [uses_aux_hidden_states]

        # Which layers are tapped also changes the graph.
        if uses_aux_hidden_states:
            if self.draft_model_config is not None:
                layer_ids = getattr(
                    self.draft_model_config.hf_config,
                    "eagle_aux_hidden_state_layer_ids",
                    None,
                )
                if layer_ids is not None:
                    # A tuple gives a stable, hashable representation.
                    factors.append(tuple(layer_ids))

        digest = safe_hash(str(factors).encode(), usedforsecurity=False)
        return digest.hexdigest()

    @staticmethod
    def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
        initial_architecture = hf_config.architectures[0]
        if hf_config.model_type in (
            "deepseek_v3",
            "deepseek_v32",
            "glm_moe_dsa",
        ):
            hf_config.model_type = "deepseek_mtp"
        if hf_config.model_type == "deepseek_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
            )
        if hf_config.model_type == "deepseek_v4":
            hf_config.model_type = "deepseek_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["DeepSeekV4MTPModel"]}
            )
        if hf_config.model_type in ("pangu_ultra_moe"):
            hf_config.model_type = "pangu_ultra_moe_mtp"
        if hf_config.model_type == "pangu_ultra_moe_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
            )

        if hf_config.architectures[0] == "MiMoForCausalLM":
            hf_config.model_type = "mimo_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {
                    "num_hidden_layers": 0,
                    "n_predict": n_predict,
                    "architectures": ["MiMoMTPModel"],
                }
            )

        if (arch := hf_config.architectures[0]) in (
            "MiMoV2ForCausalLM",
            "MiMoV2OmniForCausalLM",
        ):
            from vllm.model_executor.models.mimo_v2_mtp import (
                _MIMO_V2_PRO_NUM_MTP_LAYERS,
            )

            mtp_arch_maps = {
                "MiMoV2ForCausalLM": "MiMoV2MTPModel",
                "MiMoV2OmniForCausalLM": "MiMoV2OmniMTPModel",
            }

            hf_config.model_type = "mimo_v2_mtp"
            # vLLM currently supports only the first MiMo-V2 MTP layer.
            n_predict = _MIMO_V2_PRO_NUM_MTP_LAYERS
            hf_config.update(
                {
                    "num_hidden_layers": 0,
                    "n_predict": n_predict,
                    "num_nextn_predict_layers": n_predict,
                    "architectures": [mtp_arch_maps[arch]],
                }
            )

        if hf_config.architectures[0] == "MiMoV2FlashForCausalLM":
            from vllm.model_executor.models.mimo_v2_mtp import (
                _MIMO_V2_FLASH_NUM_MTP_LAYERS,
            )

            hf_config.model_type = "mimo_v2_mtp"
            # vLLM currently supports only the first MiMo-V2 MTP layer.
            n_predict = _MIMO_V2_FLASH_NUM_MTP_LAYERS
            hf_config.update(
                {
                    "num_hidden_layers": 0,
                    "n_predict": n_predict,
                    "num_nextn_predict_layers": n_predict,
                    "architectures": ["MiMoV2MTPModel"],
                }
            )

        if hf_config.architectures[0] == "Glm4MoeForCausalLM":
            hf_config.model_type = "glm4_moe_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {
                    "n_predict": n_predict,
                    "architectures": ["Glm4MoeMTPModel"],
                }
            )

        if hf_config.architectures[0] == "Glm4MoeLiteForCausalLM":
            hf_config.model_type = "glm4_moe_lite_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {
                    "num_hidden_layers": 0,
                    "n_predict": n_predict,
                    "architectures": ["Glm4MoeLiteMTPModel"],
                }
            )

        if hf_config.architectures[0] == "GlmOcrForConditionalGeneration":
            hf_config.model_type = "glm_ocr_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {
                    "num_hidden_layers": 0,
                    "n_predict": n_predict,
                    "architectures": ["GlmOcrMTPModel"],
                }
            )

        if hf_config.model_type == "ernie4_5_moe":
            hf_config.model_type = "ernie_mtp"
        if hf_config.model_type == "ernie_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
            )

        if hf_config.architectures[0] == "NemotronH_Super_Omni_Reasoning_V3":
            # Promote VLM's text_config so MTP detection below fires correctly
            hf_config = hf_config.text_config

        if (
            hf_config.model_type in {"nemotron_h", "nemotron_h_puzzle"}
            and hasattr(hf_config, "num_nextn_predict_layers")
            and hf_config.num_nextn_predict_layers > 0
        ):
            # Check if this is an MTP variant
            hf_config.model_type = "nemotron_h_mtp"
        if hf_config.model_type == "nemotron_h_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["NemotronHMTPModel"]}
            )

        if hf_config.model_type == "qwen3_next":
            hf_config.model_type = "qwen3_next_mtp"
        if hf_config.model_type == "qwen3_next_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
            )

        if hf_config.model_type == "exaone_moe":
            hf_config.model_type = "exaone_moe_mtp"
        if hf_config.model_type == "exaone_moe_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["ExaoneMoeMTP"]}
            )
        if "exaone4_5" in hf_config.model_type:
            hf_config.model_type = "exaone4_5_mtp"
        if hf_config.model_type == "exaone4_5_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["Exaone4_5_MTP"]}
            )
        if hf_config.model_type in ("qwen3_5", "qwen3_5_moe"):
            is_moe = hf_config.model_type == "qwen3_5_moe"
            hf_config.model_type = "qwen3_5_mtp"
            n_predict = getattr(hf_config, "mtp_num_hidden_layers", None)
            hf_config.update(
                {
                    "n_predict": n_predict,
                    "architectures": ["Qwen3_5MoeMTP" if is_moe else "Qwen3_5MTP"],
                }
            )
        if hf_config.model_type == "intern_s2_preview":
            text_config = getattr(hf_config, "text_config", None)
            is_moe = getattr(text_config, "model_type", None) == "qwen3_5_moe_text"
            hf_config.model_type = "qwen3_5_mtp"
            n_predict = getattr(text_config, "mtp_num_hidden_layers", None)
            hf_config.update(
                {
                    "n_predict": n_predict,
                    "architectures": ["Qwen3_5MoeMTP" if is_moe else "Qwen3_5MTP"],
                }
            )
        if hf_config.model_type == "longcat_flash":
            hf_config.model_type = "longcat_flash_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
            )

        if hf_config.model_type in ("step3p5", "step3p7") or hf_config.architectures[
            0
        ] in ("Step3p5ForCausalLM", "Step3p7ForConditionalGeneration"):
            quantization_config = getattr(hf_config, "quantization_config", None)
            hf_config = getattr(hf_config, "text_config", hf_config)
            if (
                quantization_config is not None
                and getattr(hf_config, "quantization_config", None) is None
            ):
                hf_config.update({"quantization_config": quantization_config})
            hf_config.model_type = "step3p5_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
            hf_config.update({"n_predict": n_predict, "architectures": ["Step3p5MTP"]})

        if initial_architecture == "MistralLarge3ForCausalLM":
            hf_config.update({"architectures": ["EagleMistralLarge3ForCausalLM"]})

        if hf_config.model_type == "hy_v3":
            hf_config.model_type = "hy_v3_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["HYV3MTPModel"]}
            )

        if hf_config.model_type in ("gemma4_assistant", "gemma4_unified_assistant"):
            hf_config.model_type = "gemma4_mtp"
            text_config = getattr(hf_config, "text_config", hf_config)
            # The assistant runs all decoder layers in a single forward
            # call to produce one draft token, so n_predict=1.
            # num_kv_shared_layers must be 0: cross-model KV sharing is
            # set up by the proposer after model construction.
            if hasattr(text_config, "num_kv_shared_layers"):
                text_config.num_kv_shared_layers = 0
            hf_config.update({"n_predict": 1, "architectures": ["Gemma4MTPModel"]})

        if (
            hf_config.model_type == "minimax_m3_vl"
            or initial_architecture == "MiniMaxM3SparseForConditionalGeneration"
        ):
            # MTP modules live on the language model of this VL checkpoint, so
            # promote text_config before rewriting it into an MTP config.
            quantization_config = getattr(hf_config, "quantization_config", None)
            hf_config = getattr(hf_config, "text_config", hf_config)
            if (
                quantization_config is not None
                and getattr(hf_config, "quantization_config", None) is None
            ):
                hf_config.update({"quantization_config": quantization_config})
            hf_config.model_type = "minimax_m3_mtp"
            n_predict = getattr(hf_config, "num_mtp_modules", 1)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["MiniMaxM3MTP"]}
            )
        elif (
            hf_config.model_type == "minimax_m3_mtp"
            or initial_architecture == "MiniMaxM3MTP"
        ):
            # Standalone MTP checkpoints already use a flat MTP config with no
            # VL wrapper / text_config to promote, so just normalize the
            # architecture and derive n_predict from num_mtp_modules.
            n_predict = getattr(hf_config, "num_mtp_modules", 1)
            hf_config.update(
                {"n_predict": n_predict, "architectures": ["MiniMaxM3MTP"]}
            )

        return hf_config

    def __post_init__(self):
        """Infer the speculative method and derive the draft-side configs.

        The work happens in this order:

        1. Infer ``method`` from ``model`` where possible: a dotted name that
           is neither a URL nor contains ``/`` is treated as a custom proposer
           class path; otherwise an unset method becomes ``"ngram"`` or
           ``"draft_model"``. Deprecated MTP aliases are folded into ``"mtp"``.
        2. When ``model`` is unset but ``num_speculative_tokens`` is given,
           fill in ``model`` from the method (for MTP, the target model).
        3. Per method, set the prompt-lookup bounds and populate
           ``draft_model_config`` / ``draft_parallel_config``. For model-based
           methods this builds a draft ``ModelConfig``, auto-detects the
           method from the draft checkpoint, and reconciles
           ``num_speculative_tokens`` with the draft's ``n_predict``.

        Returns:
            ``self``.

        Raises:
            ValueError: on inconsistent arguments (e.g. MTP without a target
                model config, ``num_speculative_tokens`` without a usable
                model/method, ``prompt_lookup_min > prompt_lookup_max``, or
                ``num_speculative_tokens`` not divisible by ``n_predict``).
            NotImplementedError: if the method cannot be recognised for the
                given draft model.
        """
        # Note: "method" is a new parameter that helps to extend the
        # configuration of non-model-based proposers, and the "model" parameter
        # will be used to set the draft model, eagle head, or additional weight
        # when needed. If users do not specify "method", the speculative method
        # will be detected automatically if possible. If the speculative method
        # can not be detected, it will be considered as the "draft_model" by
        # default.

        # infer method from user args
        # Check if the model field contains a custom module path (e.g., 'pkg.Mod')
        if (
            self.model is not None
            and "." in self.model
            and not self.model.startswith(("http://", "https://", "file://"))
            and "/" not in self.model  # not a HuggingFace repo (org/model)
        ):
            # Treat as a custom class path
            self.method = "custom_class"
        elif self.method is None:
            if self.model in ("ngram", "[ngram]"):
                self.method = "ngram"
            else:
                self.method = "draft_model"

        if self.method in get_args(MTPModelTypes) and self.method != "mtp":
            logger.warning(
                "method `%s` is deprecated and replaced with mtp.", self.method
            )
            self.method = "mtp"

        if self.model is None and self.num_speculative_tokens is not None:
            if self.method == "mtp":
                if self.target_model_config is None:
                    raise ValueError("target_model_config must be present for mtp")
                if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
                    # FIXME(luccafong): cudagraph with v32 MTP is not supported,
                    # remove this when the issue is fixed.
                    self.enforce_eager = True
                # use the draft model from the same model:
                self.model = self.target_model_config.model
                # Align the quantization of draft model for cases such as
                # --quantization fp8 with a bf16 checkpoint.
                if not self.quantization:
                    self.quantization = self.target_model_config.quantization
            elif self.method in ("ngram", "[ngram]"):
                self.model = "ngram"
            elif self.method == "ngram_gpu":
                self.model = "ngram_gpu"
            elif self.method == "suffix":
                self.model = "suffix"
            elif self.method == "extract_hidden_states":
                self.model = "extract_hidden_states"
            elif self.method == "custom_class":
                # method was set explicitly, but model should already contain
                # the custom module path. We only get here with model unset,
                # so this is always a configuration error.
                raise ValueError(
                    "method='custom_class' requires 'model' to contain the "
                    "custom proposer module path (e.g., 'my_module.MyProposer')."
                )
            else:
                raise ValueError(
                    "num_speculative_tokens was provided but without speculative model."
                )

        if self.method in ("ngram", "[ngram]"):
            self.method = "ngram"

        if self.method in ("ngram", "ngram_gpu"):
            # Set default values if not provided
            if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
                # TODO(woosuk): Tune these values. They are arbitrarily chosen.
                self.prompt_lookup_min = 5
                self.prompt_lookup_max = 5
            elif self.prompt_lookup_min is None:
                # Only the max was given (both-None is handled above).
                self.prompt_lookup_min = self.prompt_lookup_max
            elif self.prompt_lookup_max is None:
                # Only the min was given.
                self.prompt_lookup_max = self.prompt_lookup_min

            # Validate values
            if self.prompt_lookup_min > self.prompt_lookup_max:
                raise ValueError(
                    f"prompt_lookup_min={self.prompt_lookup_min} must "
                    f"be <= prompt_lookup_max={self.prompt_lookup_max}"
                )

            # TODO: currently we still need to extract vocab_size from the
            # target model config; in future, we may try to refactor it out
            # and set the draft related config as None here.
            self.draft_model_config = self.target_model_config
            self.draft_parallel_config = self.target_parallel_config
        elif self.method == "suffix":
            self._validate_suffix_decoding()
        elif self.method == "custom_class":
            # Custom class proposer does not need a draft model.
            # It will dynamically load the user-provided class at runtime.
            logger.warning_once(
                "Using a custom class-based proposer backend. This is an "
                "experimental feature and the proposer interface is subject to "
                "breaking changes in future vLLM releases."
            )
            self.prompt_lookup_max = 0
            self.prompt_lookup_min = 0
            self.draft_model_config = self.target_model_config
            self.draft_parallel_config = self.target_parallel_config
        elif self.method == "extract_hidden_states":
            from vllm.transformers_utils.configs.extract_hidden_states import (
                ExtractHiddenStatesConfig,
            )

            # ExtractHiddenStatesModel is instantiated manually in load_model()
            # We just need to store the target model config for KV cache shape info
            self.model = "extract_hidden_states"
            self.prompt_lookup_max = 0
            self.prompt_lookup_min = 0

            # A user-supplied draft_model_config may carry hf_config overrides,
            # either as an object attribute or as a plain dict entry.
            if hasattr(self.draft_model_config, "hf_config"):
                hf_config = self.draft_model_config.hf_config.to_dict()
            elif (
                isinstance(self.draft_model_config, dict)
                and "hf_config" in self.draft_model_config
            ):
                hf_config = self.draft_model_config["hf_config"]
            else:
                hf_config = {}

            self.draft_model_config = copy.copy(self.target_model_config)
            self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
                self.draft_model_config.hf_config, **hf_config
            )
            self.update_arch_()
            self.draft_parallel_config = self.target_parallel_config

        else:
            self.prompt_lookup_max = 0
            self.prompt_lookup_min = 0

            if self.model is not None:
                self.draft_model_config = ModelConfig(
                    model=self.model,
                    runner="draft",
                    tokenizer=self.target_model_config.tokenizer,
                    tokenizer_mode=self.target_model_config.tokenizer_mode,
                    trust_remote_code=self.target_model_config.trust_remote_code,
                    allowed_local_media_path=self.target_model_config.allowed_local_media_path,
                    allowed_media_domains=self.target_model_config.allowed_media_domains,
                    dtype=self.target_model_config.dtype,
                    seed=self.target_model_config.seed,
                    revision=self.revision,
                    code_revision=self.code_revision,
                    tokenizer_revision=self.target_model_config.tokenizer_revision,
                    max_model_len=self.max_model_len,  # type: ignore[arg-type]
                    spec_target_max_model_len=self.target_model_config.max_model_len,
                    quantization=self.quantization,
                    enforce_eager=self.target_model_config.enforce_eager,
                    max_logprobs=self.target_model_config.max_logprobs,
                    hf_overrides=SpeculativeConfig.hf_config_override,
                    config_format=self.target_model_config.config_format,
                )

                # Automatically detect the method
                if self.method in ("eagle", "eagle3", "dflash"):
                    pass
                # examples:
                # yuhuili/EAGLE-LLaMA3-Instruct-8B
                # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
                # AngelSlim/Qwen3-8B_eagle3
                elif "eagle-" in self.draft_model_config.model.lower():
                    self.method = "eagle"
                elif "eagle3" in self.draft_model_config.model.lower():
                    self.method = "eagle3"
                elif "dflash" in self.draft_model_config.model.lower():
                    self.method = "dflash"
                elif self.draft_model_config.hf_config.model_type == "medusa":
                    self.method = "medusa"
                elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
                    self.method = "mlp_speculator"
                elif self.draft_model_config.hf_config.model_type in get_args(
                    MTPModelTypes
                ):
                    self.method = "mtp"
                    # num_speculative_tokens may still be None here; it is
                    # only defaulted from n_predict further below, so guard
                    # the comparison instead of raising TypeError.
                    if (
                        self.num_speculative_tokens is not None
                        and self.num_speculative_tokens > 1
                        and self.draft_model_config.hf_config.model_type
                        != "step3p5_mtp"
                    ):
                        logger.warning(
                            "Enabling num_speculative_tokens > 1 will run "
                            "multiple times of forward on same MTP layer"
                            ",which may result in lower acceptance rate"
                        )
                elif self.method == "draft_model":
                    pass
                else:
                    raise NotImplementedError(
                        f"Unsupported speculative method: '{self.method}'"
                    )

                # Replace hf_config for EAGLE draft_model
                if self.method in ("eagle", "eagle3", "dflash"):
                    from vllm.transformers_utils.configs.eagle import EAGLEConfig
                    from vllm.transformers_utils.configs.speculators import (
                        SpeculatorsConfig,
                    )

                    if isinstance(
                        self.draft_model_config.hf_config,
                        (EAGLEConfig, SpeculatorsConfig),
                    ):
                        pass
                    else:
                        eagle_config = EAGLEConfig(
                            self.draft_model_config.hf_config,
                            method=self.method,
                            model_type="eagle",
                        )
                        self.draft_model_config.hf_config = eagle_config
                        self.update_arch_()

                if self.method == "dflash":
                    self.parallel_drafting = True

                if self.num_speculative_tokens is not None and hasattr(
                    self.draft_model_config.hf_config, "num_lookahead_tokens"
                ):
                    self.draft_model_config.hf_config.num_lookahead_tokens = (
                        self.num_speculative_tokens
                    )

                n_predict = getattr(
                    self.draft_model_config.hf_config, "n_predict", None
                )
                if n_predict is not None:
                    if self.num_speculative_tokens is None:
                        # Default to max value defined in draft model config.
                        self.num_speculative_tokens = n_predict
                    elif (
                        self.num_speculative_tokens > n_predict
                        and self.num_speculative_tokens % n_predict != 0
                    ):
                        # Ensure divisibility for MTP module reuse.
                        raise ValueError(
                            f"num_speculative_tokens:{self.num_speculative_tokens}"
                            f" must be divisible by {n_predict=}"
                        )

                if self.num_speculative_tokens is None:
                    raise ValueError(
                        "A speculative model was provided, but "
                        "`num_speculative_tokens` was not provided"
                    )

                self.draft_tensor_parallel_size = (
                    SpeculativeConfig._verify_and_get_draft_tp(
                        self.target_parallel_config,
                        self.draft_tensor_parallel_size,
                        self.draft_model_config.hf_config,
                    )
                )

                self.draft_model_config.max_model_len = (
                    SpeculativeConfig._maybe_override_draft_max_model_len(
                        self.max_model_len,
                        self.draft_model_config.max_model_len,
                        self.target_model_config.max_model_len,
                    )
                )

                self.draft_parallel_config = (
                    SpeculativeConfig.create_draft_parallel_config(
                        self.target_parallel_config, self.draft_tensor_parallel_size
                    )
                )
        return self

    def _validate_suffix_decoding(self):
        """Check the prerequisites and option ranges for suffix decoding.

        If ``num_speculative_tokens`` is unset it is defaulted to
        ``suffix_decoding_max_tree_depth`` (with a warning), because suffix
        decoding treats it as an upper bound and picks the actual count
        dynamically.

        Raises:
            ImportError: if ``has_arctic_inference()`` reports that Arctic
                Inference is unavailable.
            ValueError: if any ``suffix_decoding_*`` option is out of range.
        """
        if not has_arctic_inference():
            raise ImportError(
                "Arctic Inference is required for suffix decoding. "
                "Install via `pip install arctic-inference==0.1.1`."
            )

        if self.num_speculative_tokens is None:
            self.num_speculative_tokens = self.suffix_decoding_max_tree_depth
            logger.warning(
                "Defaulted num_speculative_tokens to %s for suffix decoding.",
                self.num_speculative_tokens,
            )

        # Range checks, one option at a time.
        tree_depth = self.suffix_decoding_max_tree_depth
        if tree_depth < 1:
            raise ValueError(
                f"suffix_decoding_max_tree_depth={tree_depth} must be >= 1"
            )

        cached_requests = self.suffix_decoding_max_cached_requests
        if cached_requests < 0:
            raise ValueError(
                f"suffix_decoding_max_cached_requests={cached_requests} must be >= 0"
            )

        spec_factor = self.suffix_decoding_max_spec_factor
        if spec_factor < 0:
            raise ValueError(
                f"suffix_decoding_max_spec_factor={spec_factor} must be >= 0"
            )

        min_prob = self.suffix_decoding_min_token_prob
        if not 0 <= min_prob <= 1:
            raise ValueError(
                f"suffix_decoding_min_token_prob={min_prob} must be in [0, 1]"
            )

    @staticmethod
    def _maybe_override_draft_max_model_len(
        speculative_max_model_len: int | None,
        draft_max_model_len: int,
        target_max_model_len: int,
    ) -> int:
        """Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.
        """

        if speculative_max_model_len is not None:
            if speculative_max_model_len > draft_max_model_len:
                raise ValueError(
                    f"{speculative_max_model_len=} cannot be "
                    f"larger than {draft_max_model_len=}"
                )

            if speculative_max_model_len > target_max_model_len:
                raise ValueError(
                    f"{speculative_max_model_len=} cannot be "
                    f"larger than {target_max_model_len=}"
                )

            return speculative_max_model_len

        result = min(
            draft_max_model_len,
            target_max_model_len,
        )
        if result != draft_max_model_len:
            logger.info(
                "Overriding draft model max model len from %d to %d",
                draft_max_model_len,
                result,
            )
        return result

    @staticmethod
    def _verify_and_get_draft_tp(
        target_parallel_config: ParallelConfig,
        speculative_draft_tensor_parallel_size: int | None,
        draft_hf_config: PretrainedConfig,
    ) -> int:
        """
        Verifies and adjusts the tensor parallel size for a draft model
        specified using speculative_draft_tensor_parallel_size.
        """
        # If speculative_draft_tensor_parallel_size is unset then set it
        # appropriately else verify that it is set correctly.
        if speculative_draft_tensor_parallel_size is None:
            if draft_hf_config.model_type == "mlp_speculator":
                speculative_draft_tensor_parallel_size = 1
                if target_parallel_config.tensor_parallel_size > 1:
                    logger.warning(
                        "%s cannot currently be run with tp>1; "
                        "setting speculative_draft_tensor_parallel_size=1",
                        draft_hf_config.model_type,
                    )
            else:
                speculative_draft_tensor_parallel_size = (
                    target_parallel_config.tensor_parallel_size
                )
        elif speculative_draft_tensor_parallel_size not in (
            1,
            target_parallel_config.tensor_parallel_size,
        ):
            raise ValueError(
                f"{speculative_draft_tensor_parallel_size=} cannot be "
                f"other value than 1 or target model tensor_parallel_size"
            )
        return speculative_draft_tensor_parallel_size

    def update_arch_(self):
        """
        Refresh the architecture-related fields of self.draft_model_config.

        EagleConfig and ExtractHiddenStatesConfig update architectures, so
        after the draft hf_config has been replaced, hf_text_config,
        model_arch_config, _model_info and _architecture are recomputed from it.
        """
        draft_cfg = self.draft_model_config
        draft_cfg.hf_text_config = get_hf_text_config(draft_cfg.hf_config)
        draft_cfg.model_arch_config = draft_cfg.get_model_arch_config()

        # inspect_model_cls yields a (model_info, architecture) pair.
        inspected = draft_cfg.registry.inspect_model_cls(
            draft_cfg.architectures,
            draft_cfg,
        )
        draft_cfg._model_info, draft_cfg._architecture = inspected

    @staticmethod
    def create_draft_parallel_config(
        target_parallel_config: ParallelConfig,
        speculative_draft_tensor_parallel_size: int,
    ) -> ParallelConfig:
        """Build the parallel config used by the draft worker.

        The listed fields are copied from the target parallel config; only
        the tensor parallel size is replaced by
        ``speculative_draft_tensor_parallel_size``.
        """
        inherited_fields = (
            "pipeline_parallel_size",
            "distributed_executor_backend",
            "max_parallel_loading_workers",
            "disable_custom_all_reduce",
            "ray_workers_use_nsight",
            "placement_group",
        )
        inherited = {
            name: getattr(target_parallel_config, name) for name in inherited_fields
        }
        return ParallelConfig(
            tensor_parallel_size=speculative_draft_tensor_parallel_size,
            **inherited,
        )

    @field_validator("attention_backend", mode="before")
    @classmethod
    def _parse_attention_backend(cls, value: Any) -> Any:
        """Convert a string ``attention_backend`` into its enum member.

        ``"auto"`` (any casing) maps to ``None``; any other string is
        upper-cased and looked up by name in ``AttentionBackendEnum``.
        Non-string values are passed through unchanged.
        """
        if not isinstance(value, str):
            return value
        return None if value.lower() == "auto" else AttentionBackendEnum[value.upper()]

    @model_validator(mode="after")
    def _verify_args(self) -> Self:
        """Validate the resolved speculative config.

        Checks that ``tensor_parallel_size`` was not passed,
        that ``num_speculative_tokens`` is set and positive, and that the
        synthetic-acceptance options are only used with
        ``rejection_sample_method='synthetic'`` (where they are consolidated
        into per-position rates). It then verifies the draft model config
        against the draft parallel config and compares vocab sizes for the
        draft-model method.

        Returns:
            ``self``.

        Raises:
            ValueError: if any of the checks above fails.
        """
        if self.tensor_parallel_size is not None:
            raise ValueError(
                "'tensor_parallel_size' is not a valid argument in the "
                "speculative_config. Please pass 'draft_tensor_parallel_size' instead."
            )

        spec_tokens = self.num_speculative_tokens
        if spec_tokens is None:
            raise ValueError(
                "num_speculative_tokens must be provided with "
                "speculative model unless the draft model config contains an "
                "n_predict parameter."
            )
        if spec_tokens <= 0:
            raise ValueError(
                "Expected num_speculative_tokens to be greater "
                f"than zero ({spec_tokens})."
            )

        if self.rejection_sample_method != "synthetic":
            synthetic_knobs_unset = (
                self.synthetic_acceptance_rates is None
                and self.synthetic_acceptance_length is None
            )
            if not synthetic_knobs_unset:
                raise ValueError(
                    "synthetic_acceptance_rates / synthetic_acceptance_length "
                    "are only valid with rejection_sample_method='synthetic'."
                )
        else:
            # Consolidate to per-position rates
            self.synthetic_acceptance_rates = self._resolve_synthetic_acceptance_rates(
                spec_tokens,
                self.synthetic_acceptance_rates,
                self.synthetic_acceptance_length,
            )
            self.synthetic_acceptance_length = None

        if self.draft_model_config:
            self.draft_model_config.verify_with_parallel_config(
                self.draft_parallel_config
            )

        self.verify_equal_vocab_size_if_draft_model()
        return self

    def verify_equal_vocab_size_if_draft_model(self):
        if (
            self.method == "draft_model"
            and self.target_model_config is not None
            and self.draft_model_config is not None
        ):
            target_vocab_size = self.target_model_config.get_vocab_size()
            draft_vocab_size = self.draft_model_config.get_vocab_size()
            if target_vocab_size != draft_vocab_size:
                raise ValueError(
                    f"Target and draft model should have the same vocabulary size. "
                    f"Target model vocab_size={target_vocab_size}. "
                    f"Draft model vocab_size={draft_vocab_size}. "
                    f"Using models with different tokenizers can cause out-of-bounds "
                    f"errors during speculative decoding."
                )

    @property
    def max_num_new_slots_for_drafting(self) -> int:
        """
        Calculate the maximum number of new slots that might be added to the batch
        when drafting.
        """
        slots_per_req = 0  # for serial non-draft-model methods, no change needed
        if self.parallel_drafting:
            # For parallel drafting, we need one new slot per 'masked' token
            slots_per_req = self.num_speculative_tokens - 1
        if self.uses_draft_model():
            # For draft model-based speculation, we need one new slot per request
            # Since we do not slice the draft tokens
            slots_per_req += 1
        return slots_per_req

    def use_gemma4_mtp(self) -> bool:
        return (
            self.method == "mtp"
            and self.draft_model_config is not None
            and getattr(self.draft_model_config.hf_config, "model_type", None)
            == "gemma4_mtp"
        )

    def use_step3p5_mtp(self) -> bool:
        return (
            self.method == "mtp"
            and self.draft_model_config is not None
            and getattr(self.draft_model_config.hf_config, "model_type", None)
            == "step3p5_mtp"
        )

    def use_eagle(self) -> bool:
        return self.method in ("eagle", "eagle3", "mtp", "dflash")

    def use_dflash(self) -> bool:
        return self.method == "dflash"

    def uses_dynamic_speculative_decoding(self) -> bool:
        return self.num_speculative_tokens_per_batch_size is not None

    def uses_draft_model(self) -> bool:
        return self.method == "draft_model"

    def uses_extract_hidden_states(self) -> bool:
        return self.method == "extract_hidden_states"

    def use_ngram_gpu(self) -> bool:
        return self.method == "ngram_gpu"

    def __repr__(self) -> str:
        method = self.method
        model = (
            None
            if method
            in (
                "ngram",
                "suffix",
                "extract_hidden_states",
                "custom_class",
            )
            else self.draft_model_config.model
        )
        num_spec_tokens = self.num_speculative_tokens
        return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"

attention_backend = None class-attribute instance-attribute

Attention backend to use for the draft model. When None, the backend is automatically selected. Useful when the drafter requires a different attention backend (e.g. DFlash needs a non-causal-capable backend like FLASH_ATTN).

code_revision = None class-attribute instance-attribute

The specific revision to use for the draft model code on Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

disable_padded_drafter_batch = False class-attribute instance-attribute

Disable input padding for speculative decoding. If set to True, speculative input batches can contain sequences of different lengths, which may only be supported by certain attention backends. This currently only affects the EAGLE method of speculation.

draft_load_config = None class-attribute instance-attribute

Load config for the draft model. If not specified, will use the load config from the target model.

draft_model_config = None class-attribute instance-attribute

The configuration of the draft model, initialized internally.

draft_parallel_config = None class-attribute instance-attribute

The parallel configuration for the draft model, initialized internally.

draft_sample_method = 'greedy' class-attribute instance-attribute

How the draft model samples tokens. 'greedy' always picks the argmax token, and the draft probabilities are treated as one-hot during rejection sampling. 'probabilistic' samples stochastically from the draft distribution and uses the full draft logits for the probability ratio test during rejection sampling. This comes at the cost of additional GPU memory usage.

draft_tensor_parallel_size = Field(default=None, ge=1) class-attribute instance-attribute

The degree of the tensor parallelism for the draft model. Can only be 1 or the same as the target model's tensor parallel size.

enforce_eager = None class-attribute instance-attribute

Override the default enforce_eager from model_config

max_model_len = Field(default=None, ge=1) class-attribute instance-attribute

The maximum model length of the draft model. Used when testing the ability to skip speculation for some sequences.

max_num_new_slots_for_drafting property

Calculate the maximum number of new slots that might be added to the batch when drafting.

method = None class-attribute instance-attribute

The name of the speculative method to use. If the model param is provided, the speculative method type will be detected automatically if possible. If the model param is not provided, the method name must be provided.

If using ngram method, the related configuration prompt_lookup_max and prompt_lookup_min should be considered.

model = None class-attribute instance-attribute

The name of the draft model, eagle head, or additional weights, if provided.

moe_backend = None class-attribute instance-attribute

MoE backend to use for the draft model. When None, the draft model inherits the target model's --moe-backend setting. Useful when the drafter and generator require different MoE kernels (e.g. quantized generator with unquantized drafter).

num_speculative_tokens = Field(default=None, gt=0) class-attribute instance-attribute

The number of speculative tokens, if provided. It will default to the number in the draft model config if present, otherwise, it is required.

num_speculative_tokens_per_batch_size = None class-attribute instance-attribute

Batch-size schedule used to dynamically choose speculative-token count.

Each entry is (range_start, range_end, num_speculative_tokens) with an inclusive batch-size range.

parallel_drafting = False class-attribute instance-attribute

Enable parallel drafting, where all speculative tokens are generated in parallel rather than sequentially. This can improve performance but requires the speculative model be trained to support parallel drafting. Only compatible with EAGLE and draft model methods.

prompt_lookup_max = Field(default=None, ge=1) class-attribute instance-attribute

Maximum size of ngram token window when using Ngram proposer. If unset, it defaults to prompt_lookup_min when that is provided, otherwise to 5.

prompt_lookup_min = Field(default=None, ge=1) class-attribute instance-attribute

Minimum size of ngram token window when using Ngram proposer. If unset, it defaults to prompt_lookup_max when that is provided, otherwise to 5.

quantization = None class-attribute instance-attribute

Quantization method that was used to quantize the draft model weights. If None, we assume the model weights are not quantized. Note that it only takes effect when using the draft model-based speculative method.

rejection_sample_method = 'standard' class-attribute instance-attribute

The rejection sampling method to use. 'standard' uses probabilistic rejection sampling (with or without cached draft logits, controlled by draft_sample_method). 'synthetic' accepts draft tokens with a decaying probability calibrated to synthetic_acceptance_rates (or to synthetic_acceptance_length, which is resolved to per-position rates).

revision = None class-attribute instance-attribute

The specific model version to use for the draft model. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

suffix_decoding_max_cached_requests = 10000 class-attribute instance-attribute

The maximum number of requests to cache in the global suffix tree. If exceeded, will trigger eviction in FIFO order. If set to 0, the global suffix tree is disabled and past responses are not cached (prompt trees are still used).

suffix_decoding_max_spec_factor = 1.0 class-attribute instance-attribute

The maximum spec factor for suffix decoding. The spec factor controls speculation lengths based on the prefix match length: max_spec_tokens = max_spec_factor * prefix_match_length.

suffix_decoding_max_tree_depth = 24 class-attribute instance-attribute

The maximum depth of the suffix decoding global and prompt trees. The tree depth limits the sum of the prefix match and speculation lengths.

suffix_decoding_min_token_prob = 0.1 class-attribute instance-attribute

The minimum token probability for suffix decoding. Will only speculate tokens with estimated probability (based on frequency counts) greater than or equal to this value.

synthetic_acceptance_length = None class-attribute instance-attribute

Target mean acceptance length for synthetic rejection sampling, in [1, num_speculative_tokens + 1]. Resolved internally to synthetic_acceptance_rates. Only valid when rejection_sample_method is 'synthetic'. Mutually exclusive with synthetic_acceptance_rates.

synthetic_acceptance_rates = None class-attribute instance-attribute

Per-position unconditional acceptance rates for synthetic rejection sampling. Position i's entry is the marginal probability that the first i+1 draft tokens are all accepted; the list must have length num_speculative_tokens, each entry in [0, 1], and be monotonically non-increasing. Only valid when rejection_sample_method is 'synthetic'. Mutually exclusive with synthetic_acceptance_length.

target_model_config = None class-attribute instance-attribute

The configuration of the target model.

target_parallel_config = None class-attribute instance-attribute

The parallel configuration for the target model.

tensor_parallel_size = None class-attribute instance-attribute

Users should pass "draft_tensor_parallel_size". This parameter's purpose is to warn users when they mistakenly provide the wrong argument.

use_local_argmax_reduction = False class-attribute instance-attribute

Use vocab-parallel local argmax instead of all-gathering full logits for draft token generation. Reduces communication from O(vocab_size) to O(2 * tp_size) per token. Only applies to greedy draft selection in non-tree speculation.

_acceptance_length_to_rates(length, n) staticmethod

Mean acceptance length to unconditional per-position rates, using the minimum-variance schedule.

Source code in vllm/config/speculative.py
@staticmethod
def _acceptance_length_to_rates(length: float, n: int) -> list[float]:
    """Convert a mean acceptance length into per-position acceptance rates.

    One unit of `length` is set aside, leaving a draft budget of
    ``length - 1``. That budget is spent from the first position onwards:
    each whole unit yields a rate of 1.0, the fractional leftover becomes
    the next rate, and every remaining position gets 0.0. The result is cut
    to `n` entries (the minimum-variance schedule).
    """
    expected_accepted = length - 1  # expected number of accepted draft tokens
    saturated = int(expected_accepted)  # leading positions with rate 1.0

    schedule = [1.0] * saturated
    schedule.append(expected_accepted - saturated)  # fractional leftover
    schedule.extend([0.0] * (n - saturated - 1))  # tail padded with zeros
    # The slice drops the fractional slot when it falls beyond position n
    # (e.g. length == n + 1).
    return schedule[:n]

_maybe_override_draft_max_model_len(speculative_max_model_len, draft_max_model_len, target_max_model_len) staticmethod

Determine the max sequence len for the draft model. This is usually the draft_max_model_len, but may be the target_max_model_len if it is less than the draft_max_model_len, or may be speculative_max_model_len if it is specified.

This is necessary so that sequences do not exceed the capacity of the draft model or the target model.

speculative_max_model_len is mainly used for testing that sequences can skip speculation.

Source code in vllm/config/speculative.py
@staticmethod
def _maybe_override_draft_max_model_len(
    speculative_max_model_len: int | None,
    draft_max_model_len: int,
    target_max_model_len: int,
) -> int:
    """Determine the max sequence len for the draft model. This is usually
    the draft_max_model_len, but may be the target_max_model_len if it is
    less than the draft_max_model_len, or may be speculative_max_model_len
    if it is specified.

    This is necessary so that sequences do not exceed the capacity of the
    draft model or the target model.

    speculative_max_model_len is mainly used for testing that sequences can
    skip speculation.
    """

    if speculative_max_model_len is not None:
        if speculative_max_model_len > draft_max_model_len:
            raise ValueError(
                f"{speculative_max_model_len=} cannot be "
                f"larger than {draft_max_model_len=}"
            )

        if speculative_max_model_len > target_max_model_len:
            raise ValueError(
                f"{speculative_max_model_len=} cannot be "
                f"larger than {target_max_model_len=}"
            )

        return speculative_max_model_len

    result = min(
        draft_max_model_len,
        target_max_model_len,
    )
    if result != draft_max_model_len:
        logger.info(
            "Overriding draft model max model len from %d to %d",
            draft_max_model_len,
            result,
        )
    return result

_resolve_synthetic_acceptance_rates(n, rates, length) staticmethod

Return per-position unconditional acceptance rates from exactly one of rates or length (validates range, length, and monotonicity).

Source code in vllm/config/speculative.py
@staticmethod
def _resolve_synthetic_acceptance_rates(
    n: int,
    rates: list[float] | None,
    length: float | None,
) -> list[float]:
    """Return per-position unconditional acceptance rates from exactly one
    of `rates` or `length` (validates range, length, and monotonicity)."""
    if (rates is None) == (length is None):
        raise ValueError(
            "rejection_sample_method='synthetic' requires exactly one of "
            "synthetic_acceptance_rates or synthetic_acceptance_length."
        )
    if rates is not None:
        if len(rates) != n:
            raise ValueError(
                f"synthetic_acceptance_rates must have length {n}, got {rates}."
            )
        if not all(0.0 <= r <= 1.0 for r in rates):
            raise ValueError(
                f"synthetic_acceptance_rates entries must be in [0, 1], "
                f"got {rates}."
            )
        if any(rates[i] > rates[i - 1] for i in range(1, n)):
            raise ValueError(
                f"synthetic_acceptance_rates must be non-increasing, got {rates}."
            )
        return list(rates)
    assert length is not None
    if not 1.0 <= length <= float(n + 1):
        raise ValueError(
            f"synthetic_acceptance_length must be in [1, {n + 1}], got {length}."
        )
    return SpeculativeConfig._acceptance_length_to_rates(length, n)

_verify_and_get_draft_tp(target_parallel_config, speculative_draft_tensor_parallel_size, draft_hf_config) staticmethod

Verifies and adjusts the tensor parallel size for a draft model specified using speculative_draft_tensor_parallel_size.

Source code in vllm/config/speculative.py
@staticmethod
def _verify_and_get_draft_tp(
    target_parallel_config: ParallelConfig,
    speculative_draft_tensor_parallel_size: int | None,
    draft_hf_config: PretrainedConfig,
) -> int:
    """
    Validate, or choose, the tensor parallel size for the draft model.

    An explicitly requested size must be either 1 or the target model's
    tensor parallel size. When no size is requested, the target's size is
    used, except for "mlp_speculator" drafts, which are pinned to 1.

    Raises:
        ValueError: If an explicit size is neither 1 nor the target's
            tensor parallel size.
    """
    target_tp = target_parallel_config.tensor_parallel_size

    # Explicit request: verify and pass it through unchanged.
    if speculative_draft_tensor_parallel_size is not None:
        if speculative_draft_tensor_parallel_size not in (1, target_tp):
            raise ValueError(
                f"{speculative_draft_tensor_parallel_size=} cannot be "
                f"other value than 1 or target model tensor_parallel_size"
            )
        return speculative_draft_tensor_parallel_size

    # No request: by default mirror the target model.
    if draft_hf_config.model_type != "mlp_speculator":
        return target_tp

    # mlp_speculator drafts are forced to tp=1; warn only if that differs
    # from what the target model uses.
    if target_tp > 1:
        logger.warning(
            "%s cannot currently be run with tp>1; "
            "setting speculative_draft_tensor_parallel_size=1",
            draft_hf_config.model_type,
        )
    return 1

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/speculative.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Return a hash over every setting of this config that changes the
    structure of the computation graph between the input ids/embeddings
    and the final hidden states. Anything before the input
    ids/embeddings or after the final hidden states is excluded.
    """
    # These methods (eagle3, extract_hidden_states, dflash) are treated as
    # consuming intermediate hidden states in addition to the final hidden
    # state, which changes the computation graph.
    aux_hidden_state_methods = ("eagle3", "extract_hidden_states", "dflash")
    uses_aux_hidden_states = self.method in aux_hidden_state_methods
    factors: list[Any] = [uses_aux_hidden_states]

    # Which layers are tapped also changes the graph.
    if uses_aux_hidden_states and self.draft_model_config is not None:
        draft_hf_config = self.draft_model_config.hf_config
        aux_layer_ids = getattr(
            draft_hf_config, "eagle_aux_hidden_state_layer_ids", None
        )
        if aux_layer_ids is not None:
            # Stored as a tuple, matching the original factor representation.
            factors.append(tuple(aux_layer_ids))

    digest = safe_hash(str(factors).encode(), usedforsecurity=False)
    return digest.hexdigest()

create_draft_parallel_config(target_parallel_config, speculative_draft_tensor_parallel_size) staticmethod

Create a parallel config for use by the draft worker.

This is mostly a copy of the target parallel config, except the tp_size.

Source code in vllm/config/speculative.py
@staticmethod
def create_draft_parallel_config(
    target_parallel_config: ParallelConfig,
    speculative_draft_tensor_parallel_size: int,
) -> ParallelConfig:
    """Build the parallel config used by the draft worker.

    A fixed set of fields is copied from the target parallel config; only
    the tensor parallel size is replaced by
    `speculative_draft_tensor_parallel_size`.
    """
    # Fields taken over unchanged from the target model's parallel config.
    inherited_fields = (
        "pipeline_parallel_size",
        "distributed_executor_backend",
        "max_parallel_loading_workers",
        "disable_custom_all_reduce",
        "ray_workers_use_nsight",
        "placement_group",
    )
    draft_kwargs = {
        field_name: getattr(target_parallel_config, field_name)
        for field_name in inherited_fields
    }
    # The one field that differs for the draft worker.
    draft_kwargs["tensor_parallel_size"] = speculative_draft_tensor_parallel_size

    return ParallelConfig(**draft_kwargs)

update_arch_()

EagleConfig and ExtractHiddenStatesConfig update architectures, so update all architectures-related fields in self.draft_model_config

Source code in vllm/config/speculative.py
def update_arch_(self):
    """
    Refresh every architecture-derived field of self.draft_model_config.

    EagleConfig and ExtractHiddenStatesConfig update architectures, so the
    text config, the model arch config, and the cached model info and
    architecture are all recomputed here from the current hf_config.
    """
    draft_cfg = self.draft_model_config

    # Order matters: the text config is refreshed before the arch config
    # and the registry lookup are recomputed.
    draft_cfg.hf_text_config = get_hf_text_config(draft_cfg.hf_config)
    draft_cfg.model_arch_config = draft_cfg.get_model_arch_config()

    # inspect_model_cls returns a (model_info, architecture) pair.
    draft_cfg._model_info, draft_cfg._architecture = (
        draft_cfg.registry.inspect_model_cls(draft_cfg.architectures, draft_cfg)
    )

SpeechToTextConfig

Configuration for speech-to-text models.

Attributes:

Source code in vllm/config/speech_to_text.py
@config
class SpeechToTextConfig:
    """Configuration for speech-to-text models."""

    sample_rate: float = 16_000
    """Sample rate (Hz) to resample input audio to. Most speech models expect
    16kHz audio input. The input audio will be automatically resampled to this
    rate before processing."""

    max_audio_clip_s: int | None = 30
    """Maximum duration in seconds for a single audio clip without chunking.
    Audio longer than this will be split into smaller chunks if
    `allow_audio_chunking` evaluates to True, otherwise it will be rejected. 
    `None` means audio duration can be unlimited and won't be chunked."""

    overlap_chunk_second: int = 1
    """Overlap duration in seconds between consecutive audio chunks when
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

    min_energy_split_window_size: int | None = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
    at 16kHz. If None, no chunking will be done."""

    @property
    def allow_audio_chunking(self) -> bool:
        """Whether chunking is enabled.

        True only when both `min_energy_split_window_size` and
        `max_audio_clip_s` are set (i.e. neither is None).
        """
        window_unset = self.min_energy_split_window_size is None
        # Short-circuits exactly like the original `and` of two
        # `is not None` checks.
        return not (window_unset or self.max_audio_clip_s is None)

max_audio_clip_s = 30 class-attribute instance-attribute

Maximum duration in seconds for a single audio clip without chunking. Audio longer than this will be split into smaller chunks if allow_audio_chunking evaluates to True, otherwise it will be rejected. None means audio duration can be unlimited and won't be chunked.

min_energy_split_window_size = 1600 class-attribute instance-attribute

Window size in samples for finding low-energy (quiet) regions to split audio chunks. The algorithm looks for the quietest moment within this window to minimize cutting through speech. Default 1600 samples ≈ 100ms at 16kHz. If None, no chunking will be done.

overlap_chunk_second = 1 class-attribute instance-attribute

Overlap duration in seconds between consecutive audio chunks when splitting long audio. This helps maintain context across chunk boundaries and improves transcription quality at split points.

sample_rate = 16000 class-attribute instance-attribute

Sample rate (Hz) to resample input audio to. Most speech models expect 16kHz audio input. The input audio will be automatically resampled to this rate before processing.

SpeechToTextParams dataclass

All parameters consumed by get_generation_prompt().

TranscriptionRequest.build_stt_params() constructs this object, mapping API-level fields into typed attributes. Models only receive this object, so new parameters can be added here without changing the get_generation_prompt signature.

Attributes:

Source code in vllm/config/speech_to_text.py
@dataclass
class SpeechToTextParams:
    """All parameters consumed by ``get_generation_prompt()``.

    ``TranscriptionRequest.build_stt_params()`` constructs this object,
    mapping API-level fields into typed attributes.  Models only receive
    this object, so new parameters can be added here without changing the
    ``get_generation_prompt`` signature.
    """

    # --- Required fields (no defaults): must be supplied by the caller. ---

    audio: np.ndarray
    """Resampled audio waveform for a single chunk."""

    stt_config: SpeechToTextConfig
    """Server-level speech-to-text configuration."""

    model_config: ModelConfig
    """Model configuration."""

    # --- Optional fields: each has a default. ---

    language: str | None = None
    """ISO 639-1 language code (validated / auto-detected)."""

    # NOTE(review): annotated as a single `str` although the description
    # below speaks of a list of words/phrases. Confirm the expected
    # delimiter/format with the code that consumes this field.
    hotwords: str | None = None
    """
    hotwords refers to a list of important words or phrases that the model
    should pay extra attention to during transcription.
    """

    # Annotated as a plain `str`; nothing in this class restricts it to the
    # two values named below.
    task_type: str = "transcribe"
    """``"transcribe"`` or ``"translate"``."""

    request_prompt: str = ""
    """Optional text prompt to guide the model."""

    to_language: str | None = None
    """Target language for translation (model-dependent)."""

audio instance-attribute

Resampled audio waveform for a single chunk.

hotwords = None class-attribute instance-attribute

hotwords refers to a list of important words or phrases that the model should pay extra attention to during transcription.

language = None class-attribute instance-attribute

ISO 639-1 language code (validated / auto-detected).

model_config instance-attribute

Model configuration.

request_prompt = '' class-attribute instance-attribute

Optional text prompt to guide the model.

stt_config instance-attribute

Server-level speech-to-text configuration.

task_type = 'transcribe' class-attribute instance-attribute

"transcribe" or "translate".

to_language = None class-attribute instance-attribute

Target language for translation (model-dependent).

StructuredOutputsConfig

Dataclass which contains structured outputs config for the engine.

Methods:

  • compute_hash

    WARNING: Whenever a new field is added to this config,

Attributes:

Source code in vllm/config/structured_outputs.py
@config
class StructuredOutputsConfig:
    """Dataclass which contains structured outputs config for the engine."""

    backend: StructuredOutputsBackend = "auto"
    """Which engine will be used for structured outputs (e.g. JSON schema,
    regex, etc) by default. With "auto", we will make opinionated choices
    based on request contents and what the backend libraries currently support,
    so the behavior is subject to change in each release."""
    disable_any_whitespace: bool = False
    """If `True`, json output will always be compact without any whitespace.
    If `False`, the model may generate whitespace between JSON fields,
    which is still valid JSON. This is only supported for xgrammar
    and guidance backends."""
    disable_additional_properties: bool = False
    """If `True`, the `guidance` backend will not use `additionalProperties`
    in the JSON schema. This is only supported for the `guidance` backend and
    is used to better align its behaviour with `outlines` and `xgrammar`."""
    reasoning_parser: str = ""
    """Select the reasoning parser depending on the model that you're using.
    This is used to parse the reasoning content into OpenAI API format."""
    reasoning_parser_plugin: str = ""
    """Path to a dynamically reasoning parser plugin that can be dynamically
    loaded and registered."""
    enable_in_reasoning: bool = False
    """Whether to use structured input for reasoning."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Return a hash over every setting of this config that changes the
        structure of the computation graph between the input
        ids/embeddings and the final hidden states. Anything before the
        input ids/embeddings or after the final hidden states is excluded.
        """
        # Nothing in this config affects the computation graph, so the
        # factor list is intentionally left empty.
        graph_factors: list[Any] = []
        encoded = str(graph_factors).encode()
        return safe_hash(encoded, usedforsecurity=False).hexdigest()

    @model_validator(mode="after")
    def _validate_structured_output_config(self) -> Self:
        """Reject option/backend combinations that are not supported.

        Raises:
            ValueError: If `disable_any_whitespace` is set with a backend
                other than xgrammar or guidance, or if
                `disable_additional_properties` is set with a backend
                other than guidance.
        """
        selected_backend = self.backend

        if self.disable_any_whitespace:
            whitespace_backends = ("xgrammar", "guidance")
            if selected_backend not in whitespace_backends:
                raise ValueError(
                    "disable_any_whitespace is only supported for "
                    "xgrammar and guidance backends."
                )

        if self.disable_additional_properties:
            if selected_backend != "guidance":
                raise ValueError(
                    "disable_additional_properties is only supported "
                    "for the guidance backend."
                )

        return self

backend = 'auto' class-attribute instance-attribute

Which engine will be used for structured outputs (e.g. JSON schema, regex, etc) by default. With "auto", we will make opinionated choices based on request contents and what the backend libraries currently support, so the behavior is subject to change in each release.

disable_additional_properties = False class-attribute instance-attribute

If True, the guidance backend will not use additionalProperties in the JSON schema. This is only supported for the guidance backend and is used to better align its behaviour with outlines and xgrammar.

disable_any_whitespace = False class-attribute instance-attribute

If True, json output will always be compact without any whitespace. If False, the model may generate whitespace between JSON fields, which is still valid JSON. This is only supported for xgrammar and guidance backends.

enable_in_reasoning = False class-attribute instance-attribute

Whether to use structured input for reasoning.

reasoning_parser = '' class-attribute instance-attribute

Select the reasoning parser depending on the model that you're using. This is used to parse the reasoning content into OpenAI API format.

reasoning_parser_plugin = '' class-attribute instance-attribute

Path to a reasoning parser plugin that can be dynamically loaded and registered.

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/structured_outputs.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Return a hash over every setting of this config that changes the
    structure of the computation graph between the input ids/embeddings
    and the final hidden states. Anything before the input
    ids/embeddings or after the final hidden states is excluded.
    """
    # Nothing in this config affects the computation graph, so the factor
    # list is intentionally left empty.
    graph_factors: list[Any] = []
    encoded = str(graph_factors).encode()
    return safe_hash(encoded, usedforsecurity=False).hexdigest()

UVAOffloadConfig

Configuration for UVA (Unified Virtual Addressing) CPU offloading.

Uses zero-copy access from CPU-pinned memory. Simple but requires fast CPU-GPU interconnect.

Attributes:

Source code in vllm/config/offload.py
@config
class UVAOffloadConfig:
    """Configuration for UVA (Unified Virtual Addressing) CPU offloading.

    Uses zero-copy access from CPU-pinned memory. Simple but requires
    fast CPU-GPU interconnect.
    """

    # `ge=0` declares a lower bound of zero, so negative sizes are not
    # accepted (presumably enforced by pydantic's Field validation; confirm
    # against the `config` decorator). The default of 0 disables offloading.
    cpu_offload_gb: float = Field(default=0, ge=0)
    """The space in GiB to offload to CPU, per GPU. Default is 0, which means
    no offloading. Intuitively, this argument can be seen as a virtual way to
    increase the GPU memory size. For example, if you have one 24 GB GPU and
    set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
    load a 13B model with BF16 weight, which requires at least 26GB GPU memory.
    Note that this requires fast CPU-GPU interconnect, as part of the model is
    loaded from CPU memory to GPU memory on the fly in each model forward pass.
    This uses UVA (Unified Virtual Addressing) for zero-copy access.
    """

    # `default_factory=set` builds the default by calling `set()`, so the
    # default is an empty set rather than a shared mutable literal.
    cpu_offload_params: set[str] = Field(default_factory=set)
    """The set of parameter name segments to target for CPU offloading.
    Unmatched parameters are not offloaded. If this set is empty, parameters
    are offloaded non-selectively until the memory limit defined by
    `cpu_offload_gb` is reached.
    Examples:
        - For parameter name "mlp.experts.w2_weight":
            - "experts" or "experts.w2_weight" will match.
            - "expert" or "w2" will NOT match (must be exact segments).
    This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
    """

cpu_offload_gb = Field(default=0, ge=0) class-attribute instance-attribute

The space in GiB to offload to CPU, per GPU. Default is 0, which means no offloading. Intuitively, this argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory. Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass. This uses UVA (Unified Virtual Addressing) for zero-copy access.

cpu_offload_params = Field(default_factory=set) class-attribute instance-attribute

The set of parameter name segments to target for CPU offloading. Unmatched parameters are not offloaded. If this set is empty, parameters are offloaded non-selectively until the memory limit defined by cpu_offload_gb is reached. Examples: - For parameter name "mlp.experts.w2_weight": - "experts" or "experts.w2_weight" will match. - "expert" or "w2" will NOT match (must be exact segments). This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".

VllmConfig

Dataclass which contains all vllm-related configuration. This simplifies passing around the distinct configurations in the codebase.

Methods:

Attributes:

Source code in vllm/config/vllm.py
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
@config(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    """

    # TODO: use default_factory once default constructing ModelConfig doesn't
    # try to download a model
    model_config: ModelConfig = None  # type: ignore[assignment]
    """Model configuration."""
    cache_config: CacheConfig = Field(default_factory=CacheConfig)
    """Cache configuration."""
    parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
    """Parallel configuration."""
    scheduler_config: SchedulerConfig = Field(
        default_factory=SchedulerConfig.default_factory,
    )
    """Scheduler configuration."""
    device_config: DeviceConfig = Field(default_factory=DeviceConfig)
    """Device configuration."""
    load_config: LoadConfig = Field(default_factory=LoadConfig)
    """Load configuration."""
    offload_config: OffloadConfig = Field(default_factory=OffloadConfig)
    """Model weight offloading configuration."""
    attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
    """Attention configuration."""
    mamba_config: MambaConfig = Field(default_factory=MambaConfig)
    """Mamba configuration."""
    kernel_config: KernelConfig = Field(default_factory=KernelConfig)
    """Kernel configuration."""
    lora_config: LoRAConfig | None = None
    """LoRA configuration."""
    speculative_config: SpeculativeConfig | None = None
    """Speculative decoding configuration."""
    diffusion_config: DiffusionConfig | None = None
    """Diffusion LLM (dLLM) configuration."""

    structured_outputs_config: StructuredOutputsConfig = Field(
        default_factory=StructuredOutputsConfig
    )
    """Structured outputs configuration."""
    observability_config: ObservabilityConfig = Field(
        default_factory=ObservabilityConfig
    )
    """Observability configuration."""
    quant_config: QuantizationConfig | None = None
    """Quantization configuration."""
    compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
    """`torch.compile` and cudagraph capture configuration for the model.

    As a shorthand, one can append compilation arguments via
    -cc.parameter=argument such as `-cc.mode=3` (same as `-cc='{"mode":3}'`).

    You can specify the full compilation config like so:
    `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
    """Profiling configuration."""
    kv_transfer_config: KVTransferConfig | None = None
    """The configurations for distributed KV cache transfer."""
    kv_events_config: KVEventsConfig | None = None
    """The configurations for event publishing."""
    ec_transfer_config: ECTransferConfig | None = None
    """The configurations for distributed EC cache transfer."""
    reasoning_config: ReasoningConfig | None = None
    """The configurations for reasoning model."""
    # some opaque config, only used to provide additional information
    # for the hash computation, mainly used for testing, debugging or out of
    # tree config registration.
    additional_config: dict | SupportsHash = Field(default_factory=dict)
    """Additional config for specified platform. Different platforms may
    support different configs. Make sure the configs are valid for the platform
    you are using. Contents must be hashable."""
    instance_id: str = ""
    """The ID of the vLLM instance."""
    optimization_level: OptimizationLevel = OptimizationLevel.O2
    """The optimization level. These levels trade startup time cost for
    performance, with -O0 having the best startup time and -O3 having the best
    performance. -O2 is used by default. See OptimizationLevel for full
    description."""

    performance_mode: PerformanceMode = "balanced"
    """Performance mode for runtime behavior, 'balanced' is the default.
    'interactivity' favors low end-to-end per-request latency at small batch
    sizes (fine-grained CUDA graphs, latency-oriented kernels).
    'throughput' favors aggregate tokens/sec at high concurrency (larger CUDA
    graphs, more aggressive batching, throughput-oriented kernels)."""

    weight_transfer_config: WeightTransferConfig | None = None
    """The configurations for weight transfer during RL training."""

    shutdown_timeout: int = Field(default=0, ge=0)
    """Shutdown grace period for in-flight requests. Shutdown will be delayed for
    up to this amount of time to allow already-running requests to complete. Any
    remaining requests are aborted once the timeout is reached.
    """

    def compute_hash(self) -> str:
        """Return a short hash of the configs that shape the computation graph.

        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        The hash is meant to uniquely identify all the configs that affect
        the structure of the computation graph from input ids/embeddings to
        the final hidden states, excluding anything before input
        ids/embeddings and after the final hidden states.

        Returns:
            The first 10 hex characters of the `safe_hash` digest computed
            over the stringified factor list.
        """
        from vllm import __version__

        def hash_or_placeholder(sub_config: Any) -> Any:
            # The factor list is positional: a missing sub-config must still
            # fill its slot with the same "None" marker everywhere, otherwise
            # later factors shift left and distinct configurations can end up
            # with the same list.
            return sub_config.compute_hash() if sub_config else "None"

        # summarize vllm config
        vllm_factors: list[Any] = [__version__]

        model_config = self.model_config
        if model_config:
            vllm_factors.append(model_config.compute_hash())
            # The multimodal config is only added when `compile_mm_encoder`
            # is set on the compilation config.
            if (
                self.compilation_config
                and getattr(self.compilation_config, "compile_mm_encoder", False)
                and model_config.multimodal_config
            ):
                vllm_factors.append(model_config.multimodal_config.compute_hash())
        else:
            vllm_factors.append("None")

        vllm_factors.extend(
            hash_or_placeholder(sub_config)
            for sub_config in (
                self.cache_config,
                self.parallel_config,
                self.scheduler_config,
                self.device_config,
                self.load_config,
                self.offload_config,
                self.attention_config,
                self.lora_config,
                self.speculative_config,
                self.structured_outputs_config,
                self.profiler_config,
            )
        )

        # observability_config is hashed unconditionally.
        vllm_factors.append(self.observability_config.compute_hash())

        # quant_config is deliberately not a factor: it should be captured by
        # model_config.quantization.

        vllm_factors.extend(
            hash_or_placeholder(sub_config)
            for sub_config in (
                self.compilation_config,
                self.kernel_config,
                self.kv_transfer_config,
                self.ec_transfer_config,
            )
        )

        additional_config = self.additional_config
        if not additional_config:
            vllm_factors.append("None")
        elif isinstance(additional_config, dict):
            # Plain dicts are serialized with sorted keys so that insertion
            # order does not influence the hash.
            vllm_factors.append(
                safe_hash(
                    json.dumps(additional_config, sort_keys=True).encode(),
                    usedforsecurity=False,
                ).hexdigest()
            )
        else:
            vllm_factors.append(additional_config.compute_hash())

        factors: list[Any] = [vllm_factors]
        digest = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return digest[:10]

    @property
    def max_concurrent_batches(self) -> int:
        """Number of batches that may be in flight at the same time.

        PP requires PP-size concurrent batches to fill the pipeline, and
        async scheduling requires 2 concurrent batches to overlap.
        """
        pp_size = self.parallel_config.pipeline_parallel_size
        if not self.scheduler_config.async_scheduling:
            return pp_size

        if self.use_v2_model_runner:
            return pp_size + 1

        # V1 Model Runner does not fully support async scheduling with PP, so
        # the extra batch is only granted when there is no pipeline.
        return 2 if pp_size <= 1 else pp_size

    @property
    def num_speculative_tokens(self) -> int:
        """Speculative token count taken from the first config that sets one.

        The speculative decoding config takes precedence; otherwise the
        diffusion config's `canvas_length` is used. Returns 0 when neither
        provides a value.
        """
        spec_config = self.speculative_config
        if spec_config is not None:
            spec_tokens = spec_config.num_speculative_tokens
            if spec_tokens is not None:
                return spec_tokens

        dllm_config = self.diffusion_config
        if dllm_config is not None:
            canvas_length = dllm_config.canvas_length
            if canvas_length is not None:
                return canvas_length

        return 0

    @property
    def use_v2_model_runner(self) -> bool:
        """Whether the V2 model runner should be used.

        An explicit `VLLM_USE_V2_MODEL_RUNNER` value always wins. Without
        one, diffusion models use V2; any other model uses V2 only if it is
        a default-V2 model, Triton is available, and no unsupported feature
        is reported. A warning is logged when V1 is chosen because of
        missing Triton or unsupported features.
        """
        env_override = envs.VLLM_USE_V2_MODEL_RUNNER
        if env_override is not None:
            return env_override

        model_config = self.model_config
        if model_config is not None and model_config.is_diffusion:
            return True

        if not self._is_default_v2_model_runner_model():
            return False

        if not HAS_TRITON:
            logger.warning_once(
                "Model Runner V2 requires Triton; using the V1 model runner instead."
            )
            return False

        missing_features = self._get_v2_model_runner_unsupported_features()
        if not missing_features:
            return True

        logger.warning_once(
            "Model Runner V2 does not yet support %s; using the V1 model "
            "runner instead.",
            ", ".join(missing_features),
        )
        return False

    def _is_default_v2_model_runner_model(self) -> bool:
        model_config = self.model_config
        if model_config is None:
            return False

        if model_config.runner_type != "generate":
            return False

        architectures = getattr(model_config, "architectures", [])
        return any(
            arch in DEFAULT_V2_MODEL_RUNNER_ARCHITECTURES for arch in architectures
        )

    @property
    def needs_dp_coordinator(self) -> bool:
        """
        Determine if the DPCoordinator process is needed.

        With a data parallel size of 1 or less no coordinator is needed.
        Otherwise it is needed in these cases:
        1. No model config is set.
        2. For MoE models: to handle wave coordination (even in external LB
           mode, since wave coordination runs in the coordinator).
        3. For non-MoE models in internal/hybrid LB mode: to collect and
           publish queue stats for load balancing across DP ranks.

        Returns:
            True if DPCoordinator process is needed, False otherwise.
        """
        parallel = self.parallel_config
        if not parallel.data_parallel_size > 1:
            return False

        if self.model_config is None:
            return True

        # For non-MoE models, only need coordinator in internal/hybrid LB mode
        # (for stats collection).
        return self.model_config.is_moe or not parallel.data_parallel_external_lb

    def enable_trace_function_call_for_thread(self) -> None:
        """
        Turn on function tracing for the calling thread.

        Does nothing unless the `VLLM_TRACE_FUNCTION` environment variable is
        enabled. The log file is placed under
        `<tmp>/<user>/vllm/vllm-instance-<instance_id>/` and its name embeds
        the process id, thread id and current timestamp.
        """
        if not envs.VLLM_TRACE_FUNCTION:
            return

        # add username to tmp_dir to avoid permission issues
        user_tmp_dir = os.path.join(tempfile.gettempdir(), getpass.getuser())
        raw_name = (
            f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
            f"_thread_{threading.get_ident()}_at_{datetime.now()}.log"
        )
        # The timestamp contains a space; keep the file name free of spaces.
        log_path = os.path.join(
            user_tmp_dir,
            "vllm",
            f"vllm-instance-{self.instance_id}",
            raw_name.replace(" ", "_"),
        )
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        enable_trace_function_call(log_path)

    @staticmethod
    def _get_quantization_config(
        model_config: ModelConfig, load_config: LoadConfig
    ) -> QuantizationConfig | None:
        """Build and validate the quantization config for `model_config`.

        Args:
            model_config: Model configuration; nothing is built when its
                `quantization` is None.
            load_config: Load configuration forwarded to `get_quant_config`.

        Returns:
            The quantization config, or None when the model is not quantized.

        Raises:
            ValueError: If the device capability is below the method's
                minimum, or the model dtype is not among the method's
                supported activation dtypes.
        """
        from vllm.platforms import current_platform

        if model_config.quantization is None:
            return None

        from vllm.model_executor.model_loader.weight_utils import get_quant_config

        quant_config = get_quant_config(model_config, load_config)

        # The capability check is skipped when the platform reports none.
        device_capability = current_platform.get_device_capability()
        if device_capability is not None:
            capability = device_capability.to_int()
            if capability < quant_config.get_min_capability():
                raise ValueError(
                    f"The quantization method {model_config.quantization} "
                    "is not supported for the current GPU. Minimum "
                    f"capability: {quant_config.get_min_capability()}. "
                    f"Current capability: {capability}."
                )

        supported_dtypes = quant_config.get_supported_act_dtypes()
        if model_config.dtype not in supported_dtypes:
            raise ValueError(
                f"{model_config.dtype} is not supported for quantization "
                f"method {model_config.quantization}. Supported dtypes: "
                f"{supported_dtypes}"
            )

        quant_config.maybe_update_config(
            model_config.model,
            hf_config=model_config.hf_config,
        )
        return quant_config

    @staticmethod
    def get_quantization_config(
        model_config: ModelConfig, load_config: LoadConfig
    ) -> QuantizationConfig | None:
        """Public wrapper around `_get_quantization_config`.

        The helper is called on a deep copy of `model_config`, so the
        caller's object is left untouched.
        """
        from copy import deepcopy

        # For some reason, the _ version of this modifies the model_config
        # object, so it only ever gets to see a private copy.
        model_config_copy = deepcopy(model_config)
        return VllmConfig._get_quantization_config(model_config_copy, load_config)

    def with_hf_config(
        self,
        hf_config: PretrainedConfig,
        architectures: list[str] | None = None,
    ) -> "VllmConfig":
        """Return a copy of this config whose model config uses `hf_config`.

        Args:
            hf_config: Hugging Face config to install on a deep copy of the
                current model config.
            architectures: When given, replaces `hf_config.architectures` on
                a deep copy of `hf_config`. When omitted and `hf_config` has
                no architectures, the causal-LM architecture registered for
                its `model_type` is used if there is one.

        Returns:
            The result of `replace(self, model_config=...)` with the updated
            model config.
        """
        if architectures is None:
            if hf_config.architectures is None:
                from transformers.models.auto.modeling_auto import (
                    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
                )

                # Fall back to the causal-LM class known for this model type.
                if hf_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
                    hf_config = copy.deepcopy(hf_config)
                    hf_config.architectures = [
                        MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[hf_config.model_type]
                    ]
        else:
            hf_config = copy.deepcopy(hf_config)
            hf_config.architectures = architectures

        new_model_config = copy.deepcopy(self.model_config)

        # In Transformers v5, tie_word_embeddings lives on the config of the
        # class that can see both layers to be tied. For example:
        #
        # SomeVLModel:
        #   self.language_model = SomeLanguageModel(SomeVLTextConfig)
        #   self.vision_model = SomeVisionModel(SomeVLVisionConfig)
        #
        # SomeVLModelForMultimodalLM:
        #   self.model = SomeVLModel(SomeVLConfig)
        #   self.lm_head = nn.Linear()
        #
        # So the flag is defined in SomeVLConfig and absent from
        # SomeVLTextConfig*. In vLLM the lm_head belongs to the
        # language_model, hence the flag has to be set on the language_model's
        # config.
        #
        # *Some models do carry tie_word_embeddings in SomeVLTextConfig, but
        # only because that config is also used by a text only variant:
        #
        # SomeVLModelForCausalLM:
        #   self.model = SomeLanguageModel(SomeVLTextConfig)
        #   self.lm_head = nn.Linear()
        #
        # Its presence in SomeVLTextConfig therefore cannot be used as a
        # signal for whether the value should be copied from hf_config to the
        # language_model config.
        current_hf_config = new_model_config.hf_config
        if new_model_config.is_multimodal_model and hasattr(
            current_hf_config, "tie_word_embeddings"
        ):
            text_config = hf_config.get_text_config()
            text_config.tie_word_embeddings = current_hf_config.tie_word_embeddings

        new_model_config.hf_config = hf_config
        new_model_config.model_arch_config = new_model_config.get_model_arch_config()

        return replace(self, model_config=new_model_config)

    def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None:
        """Set config attribute to default if not already set by user.

        Args:
            config_obj: Configuration object to update.
            key: Attribute name.
            value: Default value (static or callable).
        """
        if getattr(config_obj, key) is None:
            # Some config values are known before initialization and are
            # hard coded.
            # Other values depend on the user given configuration, so they are
            # implemented with lambda functions and decided at run time.
            setattr(config_obj, key, value(self) if callable(value) else value)

    def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None:
        """Apply optimization level defaults using self as root.

        Recursively applies values from defaults into nested config objects.
        Only fields present in defaults are overwritten.

        If the user configuration does not specify a value for a default field
        and if the default field is still None after all user selections are
        applied, then default values will be applied to the field. User specified
        fields will not be overridden by the default.

        Args:
            defaults: Dictionary of default values to apply.
        """

        def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None:
            """Recursively apply defaults to config_obj, using self as root."""
            for key, value in config_defaults.items():
                if not hasattr(config_obj, key):
                    continue

                current = getattr(config_obj, key)
                if isinstance(value, dict) and is_dataclass(current):
                    apply_recursive(current, value)
                else:
                    self._set_config_default(config_obj, key, value)

        apply_recursive(self, defaults)

    def _maybe_override_dynamic_sd_cudagraph_mode(self) -> None:
        speculative_config = self.speculative_config
        if (
            speculative_config is None
            or not speculative_config.uses_dynamic_speculative_decoding()
            or not self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        ):
            return

        logger.warning_once(
            "Dynamic speculative decoding changes the target verification "
            "length at runtime. Overriding cudagraph_mode from %s to "
            "PIECEWISE for reliability.",
            self.compilation_config.cudagraph_mode.name,
        )
        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

    def _post_init_kv_transfer_config(self) -> None:
        """Update KVTransferConfig based on top-level configs in VllmConfig.

        Right now, this function reads the offloading settings from
        CacheConfig and configures the KVTransferConfig accordingly.
        """
        # KV offloading is only activated when kv_offloading_size is set.
        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
            return

        kv_offloading_backend = self.cache_config.kv_offloading_backend

        # If no KVTransferConfig is provided, create a default one.
        if self.kv_transfer_config is None:
            self.kv_transfer_config = KVTransferConfig()

        if kv_offloading_backend == "native":
            if envs.VLLM_USE_SIMPLE_KV_OFFLOAD:
                config_connector = "SimpleCPUOffloadConnector"
            else:
                config_connector = "OffloadingConnector"
            self.kv_transfer_config.kv_connector = config_connector
            self.kv_transfer_config.kv_connector_extra_config.update(
                {"cpu_bytes_to_use": kv_offloading_size * (1 << 30)}
            )
        elif kv_offloading_backend == "lmcache":
            # Default to LMCache multi-process (MP) mode. The actual KV
            # storage capacity is managed by the standalone LMCache server
            # process, so ``kv_offloading_size`` is not propagated here.
            # ``LMCacheMPConnector`` falls back to ``tcp://localhost:5555``
            # when host/port are not provided via extra_config.
            self.kv_transfer_config.kv_connector = "LMCacheMPConnector"

        # This is the same for all backends
        self.kv_transfer_config.kv_role = "kv_both"

    def _verify_kv_transfer_compat(self) -> None:
        """Reject configurations that silently corrupt KV transfers."""
        if (
            self.kv_transfer_config is None
            or self.kv_transfer_config.kv_connector is None
        ):
            return

        # PyTorch's expandable_segments allocator uses CUDA VMM, which can
        # remap a virtual address range to different physical pages over the
        # engine's lifetime. KV connectors that pin KV cache memory (e.g.
        # NixlConnector via ibv_reg_mr, MooncakeConnector) end up with their
        # registrations pointing at stale physical pages after any remap,
        # producing RDMA failures like IBV_WC_REM_ACCESS_ERR /
        # NIXL_ERR_REMOTE_DISCONNECT at the first inter-node KV transfer.
        # We can't enumerate every in-tree and out-of-tree connector that
        # pins memory, so we conservatively reject the combination whenever
        # any KV connector is configured.
        #
        # CuMem allocator is exempt: CuMemAllocator.use_memory_pool toggles
        # expandable_segments off around its pool (see #40812), so the KV
        # cache allocated within that context lands on stable physical pages
        # even when the env var is set.
        if "expandable_segments:True" not in os.environ.get(
            "PYTORCH_CUDA_ALLOC_CONF", ""
        ):
            return
        if self.model_config is not None and (self.model_config.enable_cumem_allocator):
            return

        raise ValueError(
            f"KV connector {self.kv_transfer_config.kv_connector} is "
            "incompatible with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True "
            "unless enable_cumem_allocator is also enabled. PyTorch's CUDA VMM "
            "allocator can remap KV cache virtual addresses to different "
            "physical pages, invalidating any pinned/registered KV memory "
            "(e.g. IB memory regions registered by NIXL or Mooncake). Either "
            "unset expandable_segments:True or enable the cumem allocator "
            "(sleep mode does this automatically and also "
            "routes KV allocations through CuMemAllocator's pool, where "
            "expandable_segments is automatically disabled)."
        )

    def __post_init__(self):
        """Verify configs are valid & consistent with each other."""

        # To give each torch profile run a unique instance name.
        self.instance_id = f"{time.time_ns()}"

        if self.performance_mode != "balanced":
            logger.info_once("Performance mode set to '%s'.", self.performance_mode)

        self.try_verify_and_update_config()

        if self.model_config is not None:
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(self.load_config)

            self.parallel_config.is_moe_model = self.model_config.is_moe

        if (
            self.model_config is not None
            and self.model_config.enable_return_routed_experts
        ):
            if self.parallel_config.pipeline_parallel_size > 1:
                raise ValueError(
                    "--enable-return-routed-experts is incompatible with "
                    "pipeline parallelism (PP > 1)."
                )

            # Incompatible with any KV connector — covers both PD disaggregation
            # (kv_producer/kv_consumer: routing captured on P can't reach D) and
            # single-instance KV offload/sharing (kv_both: slot_mapping semantics
            # change when KV blocks live outside local GPU memory, breaking the
            # slot-indexed routed_experts buffer).
            if (
                self.kv_transfer_config is not None
                and self.kv_transfer_config.is_kv_transfer_instance
            ):
                raise ValueError(
                    "--enable-return-routed-experts is incompatible with KV "
                    "connectors (PD disaggregation, KV cache offload)."
                )

        if self.lora_config is not None:
            self.lora_config.verify_with_model_config(self.model_config)

        if (
            self.mamba_config.enable_stochastic_rounding
            and self.cache_config.mamba_ssm_cache_dtype != "float16"
        ):
            raise ValueError(
                "Stochastic rounding for Mamba cache requires "
                "the SSM cache to be float16. Please set it explicitly, "
                "by specifying `--mamba-ssm-cache-dtype float16`, or disable "
                "stochastic rounding by not specifying "
                "`--enable-mamba-cache-stochastic-rounding`."
            )

        if self.quant_config is None and self.model_config is not None:
            self.quant_config = VllmConfig._get_quantization_config(
                self.model_config, self.load_config
            )

        if (
            self.quant_config is not None
            and self.model_config is not None
            and hasattr(self.quant_config, "use_deep_gemm")
            and self.quant_config.use_deep_gemm is None
        ):
            from vllm.utils.deep_gemm import should_auto_disable_deep_gemm

            model_type = getattr(self.model_config.hf_text_config, "model_type", None)
            if should_auto_disable_deep_gemm(model_type):
                self.quant_config.use_deep_gemm = False
                logger.warning_once(
                    "Auto-disabled DeepGemm for model_type=%s on Blackwell. "
                    "DeepGemm E8M0 scale format causes accuracy degradation "
                    "for this architecture. Falling back to CUTLASS. "
                    "To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0.",
                    model_type,
                )

        from vllm.platforms import current_platform
        from vllm.v1.executor.abstract import Executor

        executor_backend = self.parallel_config.distributed_executor_backend
        executor_class = Executor.get_class(self)
        executor_supports_async_sched = executor_class.supports_async_scheduling()
        uses_rocm_deepep_ht_dbo = (
            current_platform.is_rocm()
            and self.parallel_config.enable_dbo
            and self.parallel_config.all2all_backend == "deepep_high_throughput"
        )

        if self.scheduler_config.async_scheduling:
            # Async scheduling explicitly enabled, hard fail any incompatibilities.
            # Currently, async scheduling only support eagle speculative
            # decoding.
            if uses_rocm_deepep_ht_dbo:
                raise ValueError(
                    "Async scheduling is not compatible with ROCm DeepEP "
                    "high-throughput DBO. Please use --no-async-scheduling or "
                    "select a different all2all backend."
                )
            if self.speculative_config is not None:
                if (
                    self.speculative_config.method not in get_args(EagleModelTypes)
                    and self.speculative_config.method not in get_args(NgramGPUTypes)
                    and self.speculative_config.method != "draft_model"
                ):
                    raise ValueError(
                        "Currently, async scheduling is only supported "
                        "with EAGLE/MTP/Draft Model/NGram GPU kind of "
                        "speculative decoding"
                    )
                if self.speculative_config.disable_padded_drafter_batch:
                    raise ValueError(
                        "Async scheduling is not compatible with "
                        "disable_padded_drafter_batch=True."
                    )
            if not executor_supports_async_sched:
                raise ValueError(
                    f"`{executor_backend}` does not support async scheduling yet."
                )
        elif self.scheduler_config.async_scheduling is None:
            # Enable async scheduling unless there is an incompatible option.
            if (
                self.model_config is not None
                and self.model_config.runner_type == "pooling"
            ):
                # The current implementation of asynchronous scheduling negatively
                # impacts performance of pooling models, so we disable by default.
                logger.debug(
                    "Disabling asynchronous scheduling by default for pooling model."
                )
                self.scheduler_config.async_scheduling = False
            elif (
                self.speculative_config is not None
                and self.speculative_config.method not in get_args(EagleModelTypes)
                and self.speculative_config.method not in get_args(NgramGPUTypes)
            ):
                logger.warning_once(
                    "Async scheduling not supported with %s-based "
                    "speculative decoding and will be disabled.",
                    self.speculative_config.method,
                )
                self.scheduler_config.async_scheduling = False
            elif (
                self.speculative_config is not None
                and self.speculative_config.disable_padded_drafter_batch
            ):
                logger.warning_once(
                    "Async scheduling is not compatible with "
                    "disable_padded_drafter_batch=True and will be disabled.",
                )
                self.scheduler_config.async_scheduling = False
            elif not executor_supports_async_sched:
                logger.warning_once(
                    "Async scheduling will be disabled because it is not supported "
                    "with the `%s` distributed executor backend. ",
                    executor_backend,
                )
                self.scheduler_config.async_scheduling = False
            elif uses_rocm_deepep_ht_dbo:
                logger.warning_once(
                    "Async scheduling is disabled for ROCm DeepEP "
                    "high-throughput DBO because that combination can corrupt "
                    "DP+EP generation accuracy."
                )
                self.scheduler_config.async_scheduling = False
            else:
                self.scheduler_config.async_scheduling = True

        logger.info_once(
            "Asynchronous scheduling is %s.",
            "enabled" if self.scheduler_config.async_scheduling else "disabled",
        )

        if self.parallel_config.disable_nccl_for_dp_synchronization is None:
            if self.scheduler_config.async_scheduling:
                if self.parallel_config.data_parallel_size > 1 and (
                    self.model_config is None or self.model_config.is_moe
                ):
                    logger.info_once(
                        "Disabling NCCL for DP synchronization "
                        "when using async scheduling.",
                    )
                self.parallel_config.disable_nccl_for_dp_synchronization = True
            else:
                self.parallel_config.disable_nccl_for_dp_synchronization = False

        if (
            self.speculative_config is not None
            and self.scheduler_config.async_scheduling
            and self.model_config is not None
            and not self.model_config.disable_cascade_attn
        ):
            logger.warning_once(
                "Disabling cascade attention (not yet compatible with "
                "async speculative decoding).",
            )
            self.model_config.disable_cascade_attn = True

        if (
            self.model_config is not None
            and self.model_config.multimodal_config is not None
            and self.model_config.multimodal_config.mm_tensor_ipc == "torch_shm"
            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
        ):
            raise ValueError(
                "torch_shm is known to fail without "
                "VLLM_WORKER_MULTIPROC_METHOD set to spawn"
            )

        if (
            self.model_config is not None
            and self.scheduler_config.enable_chunked_prefill
            and self.model_config.dtype == torch.float32
            and current_platform.get_device_capability() == (7, 5)
        ):
            logger.warning_once(
                "Turing devices tensor cores do not support float32 matmul. "
                "To workaround this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels."
            )

        if self.model_config is not None and self.model_config.enforce_eager:
            logger.warning(
                "Enforce eager set, disabling torch.compile and CUDAGraphs. "
                "This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none"
            )
            self.compilation_config.mode = CompilationMode.NONE
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if os.environ.get("TORCH_COMPILE_DISABLE") == "1":
            logger.warning(
                "TORCH_COMPILE_DISABLE is set, disabling torch.compile. "
                "This is equivalent to setting -cc.mode=none"
            )
            self.compilation_config.mode = CompilationMode.NONE

        # For model classes don't carry @support_torch_compile —
        # the breakable cudagraph is the supported PIECEWISE path. Auto-enable
        # it unless the user has explicitly opted out via the env var.
        if (
            self.model_config is not None
            and "VLLM_USE_BREAKABLE_CUDAGRAPH" not in os.environ
            and any(
                a
                in (
                    "DeepseekV4ForCausalLM",
                    "DeepSeekV4MTPModel",
                    "MiniMaxM3SparseForCausalLM",
                    "MiniMaxM3SparseForConditionalGeneration",
                )
                for a in self.model_config.architectures
            )
        ):
            os.environ["VLLM_USE_BREAKABLE_CUDAGRAPH"] = "1"
            logger.info_once(
                "Auto-enabling VLLM_USE_BREAKABLE_CUDAGRAPH=1. "
                "Set VLLM_USE_BREAKABLE_CUDAGRAPH=0 to opt out."
            )

        if envs.VLLM_USE_BREAKABLE_CUDAGRAPH:
            logger.warning_once(
                "VLLM_USE_BREAKABLE_CUDAGRAPH is set, disabling vLLM's "
                "torch.compile pipeline. Equivalent to -cc.mode=none."
            )
            self.compilation_config.mode = CompilationMode.NONE

        if self.compilation_config.backend == "eager" or (
            self.compilation_config.mode is not None
            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
        ):
            logger.warning(
                "Inductor compilation was disabled by user settings, "
                "optimizations settings that are only active during "
                "inductor compilation will be ignored."
            )

        def has_blocked_weights():
            if self.quant_config is not None:
                if hasattr(self.quant_config, "weight_block_size"):
                    return self.quant_config.weight_block_size is not None
                elif hasattr(self.quant_config, "has_blocked_weights"):
                    return self.quant_config.has_blocked_weights()
            return False

        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than
        # native implementation
        # https://github.com/vllm-project/vllm/issues/25094
        if has_blocked_weights():
            custom_ops = self.compilation_config.custom_ops
            if "-quant_fp8" not in custom_ops:
                custom_ops.append("+quant_fp8")

        current_platform.apply_config_platform_defaults(self)

        if self.compilation_config.mode is None:
            if self.optimization_level > OptimizationLevel.O0:
                self.compilation_config.mode = CompilationMode.VLLM_COMPILE
            else:
                self.compilation_config.mode = CompilationMode.NONE

        # By default, enable torch wrapping only when using custom Inductor lowering
        if self.compilation_config.ir_enable_torch_wrap is None:
            self.compilation_config.ir_enable_torch_wrap = (
                self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                and self.compilation_config.backend == "inductor"
            )

        if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
            if (
                self.compilation_config.backend == "inductor"
                and self.compilation_config.mode != CompilationMode.NONE
            ):
                self.compilation_config.custom_ops.append("none")
            else:
                self.compilation_config.custom_ops.append("all")

        # This populates IR op priorities,
        # must happen after compilation mode and backend are decided,
        # but before fusion defaults are applied as those may depend on op priority.
        self.kernel_config.set_platform_defaults(self)

        default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
        self._apply_optimization_level_defaults(default_config)
        if self.kernel_config.enable_flashinfer_autotune is None:
            raise ValueError(
                "KernelConfig.enable_flashinfer_autotune must be set after applying "
                "optimization level defaults."
            )

        self._maybe_override_dynamic_sd_cudagraph_mode()

        if (
            self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
            and not envs.VLLM_USE_BREAKABLE_CUDAGRAPH
        ):
            logger.info(
                "Cudagraph mode %s is not compatible with compilation mode %s."
                "Overriding to NONE.",
                self.compilation_config.cudagraph_mode,
                self.compilation_config.mode,
            )
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        # async tp is built on top of sequence parallelism and requires it.
        pass_config = self.compilation_config.pass_config
        if pass_config.fuse_gemm_comms:
            pass_config.enable_sp = True
        if pass_config.enable_sp:
            if self.parallel_config.tensor_parallel_size == 1:
                logger.warning("Sequence Parallelism requires TP>1, disabling")
                pass_config.enable_sp = False
                pass_config.fuse_gemm_comms = False
            else:
                if pass_config.sp_min_token_num is None:
                    from vllm.compilation.passes.fusion.sequence_parallelism import (
                        get_sequence_parallelism_threshold,
                    )

                    tp_size = self.parallel_config.tensor_parallel_size
                    hidden_size = self.model_config.get_hidden_size()
                    assert isinstance(self.model_config.dtype, torch.dtype)
                    element_size = self.model_config.dtype.itemsize
                    pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                        hidden_size, tp_size, element_size
                    )

                if pass_config.sp_min_token_num is None:
                    logger.warning(
                        "Model hidden_size too small for the SP "
                        "threshold heuristic, disabling. To force SP, "
                        "set pass_config.sp_min_token_num manually."
                    )
                    pass_config.enable_sp = False
                    pass_config.fuse_gemm_comms = False

        from vllm.utils.torch_utils import HAS_OPAQUE_TYPE

        if HAS_OPAQUE_TYPE:
            # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
            # fast_moe_cold_start, so force it off.
            self.compilation_config.fast_moe_cold_start = False
        elif self.compilation_config.fast_moe_cold_start is None:
            # resolve default behavior: try to be as safe as possible
            # this config is unsafe if any spec decoding draft model has a MOE.
            # We'll conservatively turn it off if we see spec decoding.
            self.compilation_config.fast_moe_cold_start = (
                self.speculative_config is None
            )

        self._set_max_num_scheduled_tokens()

        if current_platform.support_static_graph_mode():
            # if cudagraph_mode has full cudagraphs, we need to check support
            if model_config := self.model_config:
                if (
                    self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                    and model_config.pooler_config is not None
                ):
                    logger.warning_once(
                        "Pooling models do not support full cudagraphs. "
                        "Overriding cudagraph_mode to PIECEWISE."
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                elif (
                    model_config.is_encoder_decoder
                    and self.compilation_config.cudagraph_mode
                    not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
                ):
                    logger.info_once(
                        "Encoder-decoder models do not support %s. "
                        "Overriding cudagraph_mode to FULL_DECODE_ONLY.",
                        self.compilation_config.cudagraph_mode.name,
                    )
                    self.compilation_config.cudagraph_mode = (
                        CUDAGraphMode.FULL_DECODE_ONLY
                    )

            # Check if KV connector requires PIECEWISE mode for CUDA graphs
            if (
                self.kv_transfer_config is not None
                and self.kv_transfer_config.is_kv_transfer_instance
                and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
            ):
                # Lazy import to avoid circular dependencies
                from vllm.distributed.kv_transfer.kv_connector.factory import (
                    KVConnectorFactory,
                )

                connector_cls = KVConnectorFactory.get_connector_class(
                    self.kv_transfer_config
                )
                if connector_cls.requires_piecewise_for_cudagraph(
                    self.kv_transfer_config.kv_connector_extra_config
                ):
                    logger.warning_once(
                        "KV connector %s requires PIECEWISE CUDA graph mode "
                        "due to layerwise async operations that cannot be "
                        "captured in CUDA graphs. "
                        "Overriding cudagraph_mode from %s to PIECEWISE.",
                        connector_cls.__name__,
                        self.compilation_config.cudagraph_mode.name,
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

            # disable cudagraph when enforce eager execution
            if self.model_config is not None and self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")
                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
                # override related settings when enforce eager
                self.compilation_config.max_cudagraph_capture_size = 0
                self.compilation_config.cudagraph_capture_sizes = []
            else:
                self.compilation_config.cudagraph_num_of_warmups = 1

            self._set_cudagraph_sizes()

        else:
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.cache_config.kv_sharing_fast_prefill:
            if (
                self.speculative_config is not None
                and self.speculative_config.use_eagle()
            ):
                raise ValueError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens."
                )

            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings."
            )

        if (
            self.model_config
            and self.model_config.architecture == "WhisperForConditionalGeneration"
            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
        ):
            logger.warning(
                "Whisper is known to have issues with "
                "forked workers. If startup is hanging, "
                "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                "to 'spawn'."
            )

        if (
            self.kv_events_config is not None
            and self.kv_events_config.enable_kv_cache_events
            and not self.cache_config.enable_prefix_caching
        ):
            logger.warning(
                "KV cache events are on, but prefix caching is not enabled. "
                "Use --enable-prefix-caching to enable."
            )
        if (
            self.kv_events_config is not None
            and self.kv_events_config.publisher != "null"
            and not self.kv_events_config.enable_kv_cache_events
        ):
            logger.warning(
                "KV cache events are disabled, "
                "but the scheduler is configured to publish them. "
                "Modify KVEventsConfig.enable_kv_cache_events "
                "to True to enable."
            )
        current_platform.check_and_update_config(self)

        if self.use_v2_model_runner:
            self._validate_v2_model_runner()

        # Re-compute compile ranges after platform-specific config updates
        # (e.g., XPU may lower max_num_batched_tokens when MLA is enabled)
        self._set_compile_ranges()

        # Do this after all the updates to compilation_config.mode
        effective_dp_size = (
            self.parallel_config.data_parallel_size
            if self.model_config is None or self.model_config.is_moe
            else 1
        )
        self.compilation_config.set_splitting_ops_for_v1(
            all2all_backend=self.parallel_config.all2all_backend,
            data_parallel_size=effective_dp_size,
        )

        if self.compilation_config.pass_config.enable_sp:
            # With pipeline parallelism, native rms norm tracing errors due to
            # incorrect residual shape.
            # Use custom rms norm to unblock. In the future,
            # the pass will operate on higher-level IR to avoid the issue.
            # TODO: https://github.com/vllm-project/vllm/issues/27894
            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
                logger.warning(
                    "Sequence parallelism is enabled, but running in wrong "
                    "vllm compile mode: %s.",
                    self.compilation_config.mode,
                )

            if self.parallel_config.pipeline_parallel_size > 1:
                if "-rms_norm" not in self.compilation_config.custom_ops:
                    self.compilation_config.custom_ops.append("+rms_norm")
                else:
                    logger.warning_once(
                        "Sequence parallelism not supported with "
                        "native rms_norm when using %s, "
                        "this will likely lead to an error.",
                        "pipeline parallelism",
                    )

        # final check of cudagraph mode after all possible updates
        if current_platform.is_cuda_alike():
            if (
                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                and self.model_config is not None
                and not self.model_config.disable_cascade_attn
                and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()  # noqa: E501
            ):
                logger.warning_once(
                    "No piecewise cudagraph for executing cascade attention. "
                    "Will fall back to eager execution if a batch runs into "
                    "cascade attentions."
                )

            if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
                assert (
                    self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                    or envs.VLLM_USE_BREAKABLE_CUDAGRAPH
                ), (
                    "Compilation mode should be CompilationMode.VLLM_COMPILE "
                    "when cudagraph_mode piecewise cudagraphs is used, "
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                )
        if (
            self.model_config
            and envs.VLLM_BATCH_INVARIANT
            and not self.model_config.disable_cascade_attn
        ):
            self.model_config.disable_cascade_attn = True
            logger.warning_once(
                "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
            )

        if self.parallel_config.use_ubatching:
            a2a_backend = self.parallel_config.all2all_backend
            assert a2a_backend in [
                "deepep_low_latency",
                "deepep_high_throughput",
                "nixl_ep",
            ], (
                "Microbatching currently only supports the deepep_low_latency, "
                "deepep_high_throughput, and nixl_ep all2all backends. "
                f"{a2a_backend} is not supported. To fix use "
                "--all2all-backend=deepep_low_latency, "
                "--all2all-backend=deepep_high_throughput, or "
                "--all2all-backend=nixl_ep and install the matching kernels."
            )

            if not self.model_config.disable_cascade_attn:
                self.model_config.disable_cascade_attn = True
                logger.warning_once("Disabling cascade attention when DBO is enabled.")

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

        if self.reasoning_config is not None and self.model_config is not None:
            self.reasoning_config.initialize_token_ids(self.model_config)
            if not self.reasoning_config.enabled:
                logger.warning_once(
                    "Auto-initialization of reasoning token IDs failed. "
                    "Please check whether your reasoning parser has implemented "
                    "the `reasoning_start_str` and `reasoning_end_str`."
                )

        # Resolve kv_offloading-derived connector name into kv_transfer_config
        # before the HMA check below, which inspects the connector class.
        self._post_init_kv_transfer_config()

        # Hybrid KV cache manager (HMA) runtime rules:
        # - Explicit enable (--no-disable-kv-cache-manager): error if runtime
        #   disables it
        # - No preference: auto-disable for unsupported features or connector configs
        # - Explicit disable (--disable-kv-cache-manager): always respect it
        need_disable_hybrid_kv_cache_manager = False
        # logger should only print warning message for hybrid models. As we
        # can't know whether the model is hybrid or not now, so we don't log
        # warning message here and will log it later.
        if not current_platform.support_hybrid_kv_cache():
            # Hybrid KV cache manager is not supported on non-GPU platforms.
            need_disable_hybrid_kv_cache_manager = True
        if (
            self.model_config is not None
            and self.model_config.attention_chunk_size is not None
        ):
            if (
                self.speculative_config is not None
                and self.speculative_config.use_eagle()
            ):
                # Hybrid KV cache manager is not yet supported with chunked
                # local attention + eagle.
                need_disable_hybrid_kv_cache_manager = True
            elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                logger.warning(
                    "There is a latency regression when using chunked local"
                    " attention with the hybrid KV cache manager. Disabling"
                    " it, by default. To enable it, set the environment "
                    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
                )
                # Hybrid KV cache manager is not yet supported with chunked
                # local attention.
                need_disable_hybrid_kv_cache_manager = True

        if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
            # Auto-disable HMA only when the connector config does not support it.
            if self.kv_transfer_config is not None:
                from vllm.distributed.kv_transfer.kv_connector.factory import (
                    KVConnectorFactory,
                )

                if not KVConnectorFactory.supports_hma_config(self.kv_transfer_config):
                    need_disable_hybrid_kv_cache_manager = True
                    logger.warning(
                        "Turning off hybrid kv cache manager because "
                        "`--kv-transfer-config` selects a KV connector that "
                        "does not support it. Impact: hybrid SSM models "
                        "(e.g. Jamba, Bamba) require HMA and will fail at "
                        "startup without it; models with sliding window "
                        "attention will run with reduced performance. "
                        "To add HMA support to a KV connector, subclass "
                        "`SupportsHMA` defined in kv_connector/v1/base.py "
                        "(for MultiConnector, all child connectors must "
                        "support HMA)."
                    )
            self.scheduler_config.disable_hybrid_kv_cache_manager = (
                need_disable_hybrid_kv_cache_manager
            )
        elif (
            self.scheduler_config.disable_hybrid_kv_cache_manager is False
            and need_disable_hybrid_kv_cache_manager
        ):
            raise ValueError(
                "Hybrid KV cache manager was explicitly enabled but is not "
                "supported in this configuration. Consider omitting the "
                "--no-disable-hybrid-kv-cache-manager flag to let vLLM decide"
                " automatically."
            )

        if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
            # Default to enable HMA if not explicitly disabled by user or logic above.
            self.scheduler_config.disable_hybrid_kv_cache_manager = False

        if self.compilation_config.debug_dump_path:
            self.compilation_config.debug_dump_path = (
                self.compilation_config.debug_dump_path.absolute().expanduser()
            )
        if envs.VLLM_DEBUG_DUMP_PATH is not None:
            env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser()
            if self.compilation_config.debug_dump_path:
                logger.warning(
                    "Config-specified debug dump path is overridden"
                    " by VLLM_DEBUG_DUMP_PATH to %s",
                    env_path,
                )
            self.compilation_config.debug_dump_path = env_path

        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than
        # native implementation
        # https://github.com/vllm-project/vllm/issues/25094
        if has_blocked_weights():
            custom_ops = self.compilation_config.custom_ops
            if "-quant_fp8" not in custom_ops:
                custom_ops.append("+quant_fp8")

        self._verify_kv_transfer_compat()
        # Log the custom passes that are enabled
        self.compilation_config.pass_config.log_enabled_passes()

    def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list:
        """Filter candidate sizes down to multiples of the tensor-parallel size.

        Sizes that are not divisible by
        `parallel_config.tensor_parallel_size` are dropped. When at least one
        size is dropped, a warning listing the dropped sizes is logged.

        Args:
            possible_sizes: Candidate sizes to filter.

        Returns:
            A new list holding the surviving sizes in their original order.
        """
        tp_size = self.parallel_config.tensor_parallel_size

        # Partition the candidates in a single pass.
        kept_sizes = []
        removed_sizes = []
        for candidate in possible_sizes:
            if candidate % tp_size == 0:
                kept_sizes.append(candidate)
            else:
                removed_sizes.append(candidate)

        if removed_sizes:
            logger.warning(
                "Batch sizes %s are removed because they are not "
                "multiple of tp_size %d when "
                "sequence parallelism is enabled",
                removed_sizes,
                tp_size,
            )

        return kept_sizes

    def _set_max_num_scheduled_tokens(self):
        """
        Derive and validate `scheduler_config.max_num_scheduled_tokens` when
        speculative decoding is configured; do nothing otherwise.

        Some speculative decoding methods let the drafter insert extra slots
        into the batch while drafting. The reserve for those slots is
        `max_num_new_slots_for_drafting * max_num_seqs`. If no scheduling
        limit has been set, it defaults to `max_num_batched_tokens` minus that
        reserve, so the scheduler leaves room for the drafter's additions.

        Raises:
            ValueError: If the resulting limit is not positive, or if the
                limit plus the reserve exceeds `max_num_batched_tokens`.
        """
        spec_config = self.speculative_config
        if spec_config is None:
            # Nothing to reserve without speculative decoding.
            return

        sched_config = self.scheduler_config
        # Upper bound on the slots the drafter may add across all sequences.
        scheduled_token_delta = (
            spec_config.max_num_new_slots_for_drafting * sched_config.max_num_seqs
        )
        max_num_batched_tokens = sched_config.max_num_batched_tokens

        if sched_config.max_num_scheduled_tokens is None:
            sched_config.max_num_scheduled_tokens = (
                max_num_batched_tokens - scheduled_token_delta
            )
        max_num_scheduled_tokens = sched_config.max_num_scheduled_tokens

        if max_num_scheduled_tokens <= 0:
            raise ValueError(
                "max_num_scheduled_tokens is set to"
                f" {self.scheduler_config.max_num_scheduled_tokens} based on"
                " the speculative decoding settings, which does not allow"
                " any tokens to be scheduled. Increase max_num_batched_tokens"
                " to accommodate the additional draft token slots, or decrease"
                " num_speculative_tokens or max_num_seqs."
            )

        if max_num_scheduled_tokens < 8192:
            logger.warning_once(
                "max_num_scheduled_tokens is set to"
                f" {self.scheduler_config.max_num_scheduled_tokens} based on"
                " the speculative decoding settings. This may lead to suboptimal"
                " performance. Consider increasing max_num_batched_tokens to"
                " accommodate the additional draft token slots, or decrease"
                " num_speculative_tokens or max_num_seqs.",
            )

        # An explicitly configured limit must still leave room for the reserve.
        if max_num_scheduled_tokens + scheduled_token_delta > max_num_batched_tokens:
            raise ValueError(
                f"VllmConfig received max_num_scheduled_tokens but it does not have"
                " enough slots to support the speculative decoding settings."
                f" It should be greater by at least {scheduled_token_delta}, but"
                f" got {max_num_batched_tokens=} and {max_num_scheduled_tokens=}."
            )

    def _set_cudagraph_sizes(self) -> None:
        """
        Resolve the batch sizes to capture CUDA graphs for and write them back
        to `compilation_config`.

        vLLM defines the default candidate list of batch sizes for CUDA graph
        capture as:

        ```python
        decode_query_len = 1 + num_speculative_tokens
        max_graph_size = min(max_num_seqs * decode_query_len * 2, 512)
        max_graph_size = min(max_num_batched_tokens, max_graph_size)
        # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
        # up to max_graph_size
        cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
            range(256, max_graph_size + 1, 16))
        ```

        In the "interactivity" performance mode the `[1, 2, 4]` prefix is
        replaced by every size from 1 up to `min(max_graph_size, 32)`.

        `max_num_batched_tokens` is also appended to the list if it fits
        within `max_cudagraph_capture_size`, so the max batch size is captured
        even when off-stride.

        When sequence parallelism is enabled with tensor parallel size > 1, the
        list is additionally passed through
        `update_sizes_for_sequence_parallelism`.

        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in ascending order), and
        `max_cudagraph_capture_size` is set to the largest of them (0 if the
        list is empty). If CUDA graphs are not in use (no model config, eager
        mode enforced, or `cudagraph_mode` is NONE), they are set to `[]` and
        0 respectively.

        These sizes are used to capture and reuse CUDA graphs for
        performance-critical paths (e.g., decoding). Capturing enables
        significantly faster kernel dispatch by avoiding Python overhead. The
        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
        most GPUs), which controls the total allowed number of tokens in a
        batch. Since each sequence may have a variable number of tokens, the
        maximum usable batch size will depend on actual sequence lengths.

        Example:
            With `max_num_batched_tokens = 8192`, and typical sequences
            averaging ~32 tokens, most practical batch sizes fall below 256.
            However, the system will still allow capture sizes up to 512 if
            shape and memory permit.

        Note:
            If users explicitly specify cudagraph capture sizes in the
            compilation config, those will override this default logic
            (after de-duplication, sorting, and dropping sizes larger than
            `max_num_batched_tokens`).
            At runtime:

            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
            padded CUDA graph will be used.
            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
            not be used.

        Raises:
            ValueError: if both `max_cudagraph_capture_size` and
                `cudagraph_capture_sizes` are user-specified and the former
                does not equal the largest resolved capture size.
            AssertionError: if the resolved maximum capture size is < 1, or a
                user-specified `cudagraph_capture_sizes` list is empty.
        """

        if (
            self.model_config is not None
            and not self.model_config.enforce_eager
            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
        ):
            # determine the initial max_cudagraph_capture_size
            max_cudagraph_capture_size = (
                self.compilation_config.max_cudagraph_capture_size
            )
            if max_cudagraph_capture_size is None:
                # default: twice the number of decode tokens that max_num_seqs
                # sequences can produce in one step, capped at 512
                decode_query_len = 1 + self.num_speculative_tokens
                max_cudagraph_capture_size = min(
                    self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
                )
            # never capture more tokens than a single batch may contain
            max_num_tokens = self.scheduler_config.max_num_batched_tokens
            max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)

            assert max_cudagraph_capture_size >= 1, (
                "Maximum cudagraph size should be greater than or equal to 1 "
                "when using cuda graph."
            )

            # determine the cudagraph_capture_sizes
            if self.compilation_config.cudagraph_capture_sizes is not None:
                assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
                    "cudagraph_capture_sizes should contain at least one element "
                    "when using cuda graph."
                )
                # de-duplicate the sizes provided by the config
                dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
                # note: user sizes are filtered against max_num_tokens, not
                # against max_cudagraph_capture_size; a mismatch with a
                # user-specified max is reported by the ValueError below
                cudagraph_capture_sizes = [
                    i for i in dedup_sizes if i <= max_num_tokens
                ]
                # sort to make sure the sizes are in ascending order
                cudagraph_capture_sizes.sort()
            else:
                if self.performance_mode == "interactivity":
                    # Fine-grained CUDA graphs at small batch sizes
                    # for minimal padding overhead
                    interactivity_max = min(max_cudagraph_capture_size, 32)
                    cudagraph_capture_sizes = list(range(1, interactivity_max + 1))
                else:
                    cudagraph_capture_sizes = [
                        i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
                    ]
                if max_cudagraph_capture_size >= 8:
                    # Step size 8 for small batch sizes, up to 256(not included)
                    cudagraph_capture_sizes += list(
                        range(8, min(max_cudagraph_capture_size + 1, 256), 8)
                    )
                if max_cudagraph_capture_size >= 256:
                    # Step size 16 for larger batch sizes
                    cudagraph_capture_sizes += list(
                        range(256, max_cudagraph_capture_size + 1, 16)
                    )
                # ensure max_num_tokens is captured if within max capture size
                if (
                    max_num_tokens <= max_cudagraph_capture_size
                    and max_num_tokens not in cudagraph_capture_sizes
                ):
                    cudagraph_capture_sizes.append(max_num_tokens)
                # de-duplicate and sort the sizes (the interactivity range and
                # the stride-8 range can overlap)
                cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))

            if (
                self.parallel_config.tensor_parallel_size > 1
                and self.compilation_config.pass_config.enable_sp
            ):
                cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                    cudagraph_capture_sizes
                )

            # a user-specified compilation_config.max_cudagraph_capture_size
            # gets truncated to valid_max_size when the two are inconsistent.
            valid_max_size = (
                cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
            )
            if (
                self.compilation_config.max_cudagraph_capture_size is not None
                and self.compilation_config.max_cudagraph_capture_size != valid_max_size
            ):
                # raise error only when both two flags are user-specified
                # and they are inconsistent with each other
                if self.compilation_config.cudagraph_capture_sizes is not None:
                    raise ValueError(
                        "customized max_cudagraph_capture_size"
                        f"(={self.compilation_config.max_cudagraph_capture_size}) "
                        "should be consistent with the max value of "
                        f"cudagraph_capture_sizes(={valid_max_size})"
                    )

                logger.warning(
                    "Truncating max_cudagraph_capture_size to %d",
                    valid_max_size,
                )
            # always set the final max_cudagraph_capture_size
            self.compilation_config.max_cudagraph_capture_size = valid_max_size

            if self.compilation_config.cudagraph_capture_sizes is not None and len(
                cudagraph_capture_sizes
            ) < len(self.compilation_config.cudagraph_capture_sizes):
                # If users have specified capture sizes, we only need to
                # compare the lens before and after modification since the modified
                # list is only the subset of the original list.
                logger.warning(
                    (
                        "cudagraph_capture_sizes specified in compilation_config"
                        " %s is overridden by config %s"
                    ),
                    self.compilation_config.cudagraph_capture_sizes,
                    cudagraph_capture_sizes,
                )
            # always write back the final sizes
            self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes

        else:
            # no cudagraph in use
            self.compilation_config.max_cudagraph_capture_size = 0
            self.compilation_config.cudagraph_capture_sizes = []

        # complete the remaining process.
        self.compilation_config.post_init_cudagraph_sizes()

    def _set_compile_ranges(self):
        """
        Set the compile ranges for the compilation config.

        Collects candidate endpoints from several sources and stores them,
        sorted ascending, in `compilation_config.compile_ranges_endpoints`:

        - `scheduler_config.max_num_batched_tokens` (the overall upper bound),
        - the allreduce+RMS fusion token limit (when `fuse_allreduce_rms`),
        - `sp_min_token_num - 1` (when `enable_sp`),
        - the rope+kvcache fusion token limit (when `fuse_rope_kvcache`),
        - user-provided endpoints that lie strictly between 1 and the upper
          bound.

        Side effect: when sequence parallelism is enabled, no
        `sp_min_token_num` was provided, and a model config is available,
        `pass_config.sp_min_token_num` is filled in with the computed
        threshold. Without a model config it is left as None and no
        SP endpoint is added.

        Raises:
            AssertionError: if a user-provided endpoint is not a positive int,
                or the model dtype is not a `torch.dtype` where it is needed.
        """
        compilation_config = self.compilation_config
        pass_config = compilation_config.pass_config
        computed_compile_ranges_endpoints = []

        # The upper bound of the compile ranges is the max_num_batched_tokens.
        compile_range_end = self.scheduler_config.max_num_batched_tokens
        if compile_range_end is not None:
            computed_compile_ranges_endpoints.append(compile_range_end)

        # Add the compile ranges for flashinfer/aiter.
        if pass_config.fuse_allreduce_rms:
            tp_size = self.parallel_config.tensor_parallel_size
            from vllm._aiter_ops import rocm_aiter_ops

            if rocm_aiter_ops.is_enabled():
                max_size = rocm_aiter_ops.get_aiter_allreduce_max_size()
            else:
                max_size = pass_config.flashinfer_max_size(tp_size)
            if max_size is not None and self.model_config is not None:
                assert isinstance(self.model_config.dtype, torch.dtype)
                # Convert the size limit into a token count using the size of
                # one token's hidden state (hidden_size * itemsize).
                max_token_num = max_size // (
                    self.model_config.get_hidden_size()
                    * self.model_config.dtype.itemsize
                )
                if compile_range_end is not None and max_token_num < compile_range_end:
                    computed_compile_ranges_endpoints.append(max_token_num)
                else:
                    logger.debug(
                        "Max num batched tokens below allreduce-rms fusion threshold, "
                        "allreduce-rms fusion will be enabled for all num_tokens."
                    )

        # Add the compile ranges for sequence parallelism
        if pass_config.enable_sp:
            # Calculate min_token_num if not explicitly provided.
            # User override works regardless of hidden_size. The threshold can
            # only be derived when a model config is present; guard it the
            # same way the allreduce-rms branch does instead of dereferencing
            # a None model_config.
            if (
                pass_config.sp_min_token_num is None
                and self.model_config is not None
            ):
                from vllm.compilation.passes.fusion.sequence_parallelism import (
                    get_sequence_parallelism_threshold,
                )

                tp_size = self.parallel_config.tensor_parallel_size
                hidden_size = self.model_config.get_hidden_size()
                assert isinstance(self.model_config.dtype, torch.dtype)
                element_size = self.model_config.dtype.itemsize
                pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                    hidden_size, tp_size, element_size
                )

            min_token_num = pass_config.sp_min_token_num
            max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
            if (
                min_token_num is not None
                and max_num_batched_tokens is not None
                and 1 < min_token_num < max_num_batched_tokens
            ):
                # Add endpoint at min_token_num - 1 to ensure SP applies
                # starting from min_token_num
                # This creates ranges: [1, min-1] (no SP), [min, max] (SP applies)
                computed_compile_ranges_endpoints.append(min_token_num - 1)

        if pass_config.fuse_rope_kvcache:
            max_token_num = pass_config.rope_kvcache_fusion_max_token_num
            if max_token_num is not None:
                if compile_range_end is not None and max_token_num < compile_range_end:
                    computed_compile_ranges_endpoints.append(max_token_num)
                else:
                    # %s rather than %d: this branch is also reached when
                    # compile_range_end is None, which %d cannot format.
                    logger.debug(
                        "Max num batched tokens below rope+kvcache fusion threshold, "
                        "rope+kvcache fusion enabled for num_tokens <= %s.",
                        compile_range_end,
                    )

        if compilation_config.compile_ranges_endpoints is not None:
            for x in compilation_config.compile_ranges_endpoints:
                assert isinstance(x, int)
                assert x > 0, f"Invalid compile range endpoint: {x}"
                # Keep only endpoints strictly inside (1, compile_range_end).
                if compile_range_end is not None and 1 < x < compile_range_end:
                    computed_compile_ranges_endpoints.append(x)
        compilation_config.compile_ranges_endpoints = sorted(
            computed_compile_ranges_endpoints
        )

    def try_verify_and_update_config(self):
        """Run the model-specific config hooks, at most once per model config.

        Does nothing when there is no model config, when the model config is
        already flagged as updated, or when its architecture is unknown.
        Otherwise it dispatches to the architecture's entry in
        `MODELS_CONFIG_MAP`, the hybrid attention/mamba hook, and the
        sequence-classification adapter hook as applicable, then reconciles
        `load_config.load_format` for Run:ai object-storage weights.

        Raises:
            ValueError: if the weights are a Run:ai object URI and
                `load_format` is neither "auto" nor one of the supported
                streaming formats.
        """
        model_config = self.model_config
        if model_config is None:
            return

        # The flag lives on the model config so repeated calls are no-ops.
        if getattr(model_config, "config_updated", False):
            return
        model_config.config_updated = True

        arch_name = model_config.architecture
        if arch_name is None:
            return

        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP,
            HybridAttentionMambaModelConfig,
        )

        arch_hook = MODELS_CONFIG_MAP.get(arch_name)
        if arch_hook is not None:
            arch_hook.verify_and_update_config(self)

        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)

        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import SequenceClassificationConfig

            SequenceClassificationConfig.verify_and_update_config(self)

        weights_on_object_storage = hasattr(
            self.model_config, "model_weights"
        ) and is_runai_obj_uri(self.model_config.model_weights)
        if not weights_on_object_storage:
            return

        load_format = self.load_config.load_format
        if load_format == "auto":
            logger.info(
                "Detected Run:ai model config. "
                "Overriding `load_format` to 'runai_streamer'"
            )
            self.load_config.load_format = "runai_streamer"
            return

        supported_formats = (
            "modelexpress",
            "runai_streamer",
            "runai_streamer_sharded",
        )
        if load_format not in supported_formats:
            raise ValueError(
                f"To load a model from object storage (S3/GCS/Azure), "
                f"'load_format' must be 'modelexpress', 'runai_streamer' or "
                f"'runai_streamer_sharded', "
                f"but got '{self.load_config.load_format}'. "
                f"Model: {self.model_config.model}"
            )

    def compile_debug_dump_path(self) -> Path | None:
        """Returns a rank-aware path for dumping
        torch.compile debug information.
        """
        if self.compilation_config.debug_dump_path is None:
            return None
        tp_rank = self.parallel_config.rank
        dp_rank = self.parallel_config.data_parallel_index
        append_path = f"rank_{tp_rank}_dp_{dp_rank}"
        path = self.compilation_config.debug_dump_path / append_path
        return path

    def __str__(self):
        """Return a one-line, comma-separated summary of the key settings.

        `model_config` is optional (it defaults to None and other methods of
        this class guard against that), so every field sourced from it is
        rendered as `None` when no model config is set instead of raising
        AttributeError. With a model config present the output is unchanged.
        """
        model_config = self.model_config

        def model_attr(name):
            # Only substitute None for a missing model config; a missing
            # attribute on a real model config still raises as before.
            if model_config is None:
                return None
            return getattr(model_config, name)

        load = self.load_config
        parallel = self.parallel_config
        cache = self.cache_config

        # Field order and the !r conversions are part of the established
        # output format; keep them stable.
        fields = [
            f"model={model_attr('model')!r}",
            f"speculative_config={self.speculative_config!r}",
            f"tokenizer={model_attr('tokenizer')!r}",
            f"skip_tokenizer_init={model_attr('skip_tokenizer_init')}",
            f"tokenizer_mode={model_attr('tokenizer_mode')}",
            f"revision={model_attr('revision')}",
            f"tokenizer_revision={model_attr('tokenizer_revision')}",
            f"trust_remote_code={model_attr('trust_remote_code')}",
            f"dtype={model_attr('dtype')}",
            f"max_seq_len={model_attr('max_model_len')}",
            f"download_dir={load.download_dir!r}",
            f"load_format={load.load_format}",
            f"tensor_parallel_size={parallel.tensor_parallel_size}",
            f"pipeline_parallel_size={parallel.pipeline_parallel_size}",
            f"data_parallel_size={parallel.data_parallel_size}",
            f"decode_context_parallel_size={parallel.decode_context_parallel_size}",
            f"dcp_comm_backend={parallel.dcp_comm_backend}",
            f"disable_custom_all_reduce={parallel.disable_custom_all_reduce}",
            f"quantization={model_attr('quantization')}",
            f"quantization_config={model_attr('quantization_config')}",
            f"enforce_eager={model_attr('enforce_eager')}",
            "enable_return_routed_experts="
            f"{model_attr('enable_return_routed_experts')}",
            f"kv_cache_dtype={cache.cache_dtype}",
            f"device_config={self.device_config.device}",
            f"structured_outputs_config={self.structured_outputs_config!r}",
            f"observability_config={self.observability_config!r}",
            f"seed={model_attr('seed')}",
            f"served_model_name={model_attr('served_model_name')}",
            f"enable_prefix_caching={cache.enable_prefix_caching}",
            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}",
            f"pooler_config={model_attr('pooler_config')!r}",
            f"compilation_config={self.compilation_config!r}",
            f"kernel_config={self.kernel_config!r}",
        ]
        return ", ".join(fields)

    def _get_v2_model_runner_unsupported_features(self) -> list[str]:
        """Return a label for each enabled feature the V2 model runner lacks.

        The checks are independent of one another and labels are appended in
        a fixed order. An empty list means none of the checked features is
        enabled in this config.
        """
        model_config = self.model_config
        spec_config = self.speculative_config
        parallel = self.parallel_config
        found: list[str] = []

        if parallel.prefill_context_parallel_size > 1:
            found.append("prefill context parallelism")

        if self.compilation_config.mode == CompilationMode.STOCK_TORCH_COMPILE:
            found.append("stock torch.compile")

        sp_enabled = self.compilation_config.pass_config.enable_sp
        if sp_enabled and parallel.tensor_parallel_size > 1:
            found.append("sequence parallelism")

        # V2 does not implement the external_launcher (torchrun) PP-output
        # broadcast that V1 uses to keep all ranks in sync (broadcast_pp_output).
        uses_external_launcher = (
            parallel.distributed_executor_backend == "external_launcher"
        )
        if uses_external_launcher and parallel.pipeline_parallel_size > 1:
            found.append("pipeline parallelism with external_launcher")

        if spec_config is not None:
            method = spec_config.method

            # TODO: ngram / ngram_gpu are not supported by the v2 model runner yet
            if method in ("ngram", "ngram_gpu"):
                found.append("ngram/ngram_gpu speculative decoding")
            elif method not in ("eagle", "eagle3", "mtp", "dflash"):
                found.append(f"speculative method '{method}'")

            if spec_config.uses_dynamic_speculative_decoding():
                found.append("dynamic speculative decoding")

            # V2 EagleSpeculator does not support parallel_drafting (for P-Eagle)
            # DFlash uses parallel drafting natively in V2 via DFlashSpeculator.
            if spec_config.parallel_drafting and method != "dflash":
                found.append("parallel drafting for EAGLE speculative decoding")

            if method == "eagle3" and parallel.pipeline_parallel_size > 1:
                found.append("EAGLE3 with pipeline parallelism")

        if parallel.enable_dbo:
            found.append("dual batch overlap")

        if parallel.enable_elastic_ep:
            found.append("elastic expert parallelism")

        # Everything below that depends on the model config is skipped when
        # no model config is set.
        if model_config is not None:
            if model_config.enable_return_routed_experts:
                # Will be added by https://github.com/vllm-project/vllm/pull/38163
                found.append("routed experts capture")

            from importlib.metadata import entry_points

            # Logits processors can come from the config or from plugins
            # registered under the "vllm.logits_processors" entry-point group.
            plugin_processors = bool(entry_points(group="vllm.logits_processors"))
            if model_config.logits_processors or plugin_processors:
                found.append("custom logits processors")

            if model_config.enable_prompt_embeds:
                found.append("prompt embeds")

            if model_config.runner_type == "generate" and (
                model_config.logprobs_mode in ("raw_logits", "processed_logits")
            ):
                found.append(f"logprobs mode '{model_config.logprobs_mode}'")

        if self.cache_config.kv_sharing_fast_prefill:
            # Will be added by https://github.com/vllm-project/vllm/pull/35045
            found.append("KV sharing fast prefill")

        if self.ec_transfer_config is not None:
            # Will be added by https://github.com/vllm-project/vllm/pull/38390
            found.append("EC transfer")

        return found

    def _validate_v2_model_runner(self) -> None:
        """Reject configs that the V2 model runner cannot serve.

        Raises:
            ValueError: if Triton is unavailable, or if any feature reported
                by `_get_v2_model_runner_unsupported_features` is enabled.

        A reasoning config is not an error; it only triggers a one-time
        warning about the unsupported `thinking_token_budget` parameter.
        """
        if not HAS_TRITON:
            raise ValueError("Model Runner V2 requires Triton.")

        if missing := self._get_v2_model_runner_unsupported_features():
            raise ValueError(
                f"Model Runner V2 does not yet support: {', '.join(missing)}"
            )

        if self.reasoning_config is None:
            return
        logger.warning_once(
            "Model Runner V2 does not yet support the thinking_token_budget "
            "request parameter. Set VLLM_USE_V2_MODEL_RUNNER=0 if this is required."
        )

    def validate_block_size(self) -> None:
        """Validate block_size against DCP and mamba constraints.

        Called after Platform.update_block_size_for_backend() has
        finalised block_size.

        Side effect: when decode context parallelism is enabled and
        `dcp_kv_cache_interleave_size` is > 1 and differs from
        `cp_kv_cache_interleave_size`, the latter is overwritten with the
        former and a warning is logged once.

        Raises:
            AssertionError: if block_size is smaller than, or not a multiple
                of, `cp_kv_cache_interleave_size` (DCP enabled); or, in Mamba
                cache "align" mode, if block_size exceeds
                `max_num_batched_tokens`, a positive
                `long_prefill_token_threshold` is below block_size, or
                chunked MM input is disabled.
        """
        block_size = self.cache_config.block_size

        # DCP interleave-size compatibility
        if self.parallel_config.decode_context_parallel_size > 1:
            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
                self.parallel_config.cp_kv_cache_interleave_size
                != self.parallel_config.dcp_kv_cache_interleave_size
            ):
                self.parallel_config.cp_kv_cache_interleave_size = (
                    self.parallel_config.dcp_kv_cache_interleave_size
                )
                logger.warning_once(
                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
                    "deprecated when PCP is fully supported."
                )
            assert (
                self.parallel_config.cp_kv_cache_interleave_size <= block_size
                and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
            ), (
                f"Block_size({block_size}) should be greater "
                "than or equal to and divisible by cp_kv_cache_interleave_size "
                f"({self.parallel_config.cp_kv_cache_interleave_size})."
            )

        # Mamba cache align-mode constraints
        if self.cache_config.mamba_cache_mode == "align":
            assert block_size <= self.scheduler_config.max_num_batched_tokens, (
                "In Mamba cache align mode, block_size "
                f"({block_size}) must be <= "
                "max_num_batched_tokens "
                f"({self.scheduler_config.max_num_batched_tokens})."
            )
            # A threshold of 0 (or less) is treated as "not set" and skipped.
            long_prefill_threshold = self.scheduler_config.long_prefill_token_threshold
            if long_prefill_threshold > 0:
                # Give this check a diagnostic like its sibling asserts have.
                assert long_prefill_threshold >= block_size, (
                    "In Mamba cache align mode, long_prefill_token_threshold "
                    f"({long_prefill_threshold}) must be >= "
                    f"block_size ({block_size})."
                )
            assert not self.scheduler_config.disable_chunked_mm_input, (
                "Chunked MM input is required because we need the flexibility "
                "to schedule a multiple of block_size tokens even if they are "
                "in the middle of a mm input"
            )

    @model_validator(mode="after")
    def validate_nvfp4_kv_cache_with_mla(self) -> "VllmConfig":
        """Reject the nvfp4 KV cache dtype for models that use MLA.

        Skipped when no model config is set.

        Raises:
            ValueError: if `cache_config.cache_dtype` is "nvfp4" and the
                model config reports `use_mla`.
        """
        model_config = self.model_config
        if (
            model_config is not None
            and self.cache_config.cache_dtype == "nvfp4"
            and model_config.use_mla
        ):
            raise ValueError(
                "nvfp4 KV cache is not supported with MLA (Multi-head Latent "
                "Attention) backends. Please use a different --kv-cache-dtype "
                "(e.g., 'fp8' or 'auto') for MLA models such as DeepSeek."
            )
        return self

    @model_validator(mode="after")
    def validate_mamba_block_size(self) -> "VllmConfig":
        """Require prefix caching whenever a custom mamba block size is set.

        A mamba block size counts as "set" when it is not None and differs
        from the model's `max_model_len`. Skipped when no model config is set.

        Raises:
            ValueError: if a mamba block size is set while prefix caching is
                disabled.
        """
        model_config = self.model_config
        if model_config is None:
            return self

        cache = self.cache_config
        explicitly_set = (
            cache.mamba_block_size is not None
            and cache.mamba_block_size != model_config.max_model_len
        )
        if not explicitly_set:
            return self
        if not cache.enable_prefix_caching:
            raise ValueError(
                "--mamba-block-size can only be set with --enable-prefix-caching"
            )
        return self

additional_config = Field(default_factory=dict) class-attribute instance-attribute

Additional config for specified platform. Different platforms may support different configs. Make sure the configs are valid for the platform you are using. Contents must be hashable.

attention_config = Field(default_factory=AttentionConfig) class-attribute instance-attribute

Attention configuration.

cache_config = Field(default_factory=CacheConfig) class-attribute instance-attribute

Cache configuration.

compilation_config = Field(default_factory=CompilationConfig) class-attribute instance-attribute

torch.compile and cudagraph capture configuration for the model.

As a shorthand, one can append compilation arguments via -cc.parameter=argument such as -cc.mode=3 (same as -cc='{"mode":3}').

You can specify the full compilation config like so: {"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}

device_config = Field(default_factory=DeviceConfig) class-attribute instance-attribute

Device configuration.

diffusion_config = None class-attribute instance-attribute

Diffusion LLM (dLLM) configuration.

ec_transfer_config = None class-attribute instance-attribute

The configurations for distributed EC cache transfer.

instance_id = '' class-attribute instance-attribute

The ID of the vLLM instance.

kernel_config = Field(default_factory=KernelConfig) class-attribute instance-attribute

Kernel configuration.

kv_events_config = None class-attribute instance-attribute

The configurations for event publishing.

kv_transfer_config = None class-attribute instance-attribute

The configurations for distributed KV cache transfer.

load_config = Field(default_factory=LoadConfig) class-attribute instance-attribute

Load configuration.

lora_config = None class-attribute instance-attribute

LoRA configuration.

mamba_config = Field(default_factory=MambaConfig) class-attribute instance-attribute

Mamba configuration.

model_config = None class-attribute instance-attribute

Model configuration.

needs_dp_coordinator property

Determine if the DPCoordinator process is needed.

The DPCoordinator is needed in two cases:

1. For MoE models with DP > 1: to handle wave coordination (even in external LB mode, since wave coordination runs in the coordinator).
2. For non-MoE models in internal/hybrid LB mode: to collect and publish queue stats for load balancing across DP ranks.

Returns:

  • bool

    True if DPCoordinator process is needed, False otherwise.

observability_config = Field(default_factory=ObservabilityConfig) class-attribute instance-attribute

Observability configuration.

offload_config = Field(default_factory=OffloadConfig) class-attribute instance-attribute

Model weight offloading configuration.

optimization_level = OptimizationLevel.O2 class-attribute instance-attribute

The optimization level. These levels trade startup time cost for performance, with -O0 having the best startup time and -O3 having the best performance. -O2 is used by default. See OptimizationLevel for full description.

parallel_config = Field(default_factory=ParallelConfig) class-attribute instance-attribute

Parallel configuration.

performance_mode = 'balanced' class-attribute instance-attribute

Performance mode for runtime behavior, 'balanced' is the default. 'interactivity' favors low end-to-end per-request latency at small batch sizes (fine-grained CUDA graphs, latency-oriented kernels). 'throughput' favors aggregate tokens/sec at high concurrency (larger CUDA graphs, more aggressive batching, throughput-oriented kernels).

profiler_config = Field(default_factory=ProfilerConfig) class-attribute instance-attribute

Profiling configuration.

quant_config = None class-attribute instance-attribute

Quantization configuration.

reasoning_config = None class-attribute instance-attribute

The configuration for reasoning models.

scheduler_config = Field(default_factory=(SchedulerConfig.default_factory)) class-attribute instance-attribute

Scheduler configuration.

shutdown_timeout = Field(default=0, ge=0) class-attribute instance-attribute

Shutdown grace period for in-flight requests. Shutdown will be delayed for up to this amount of time to allow already-running requests to complete. Any remaining requests are aborted once the timeout is reached.

speculative_config = None class-attribute instance-attribute

Speculative decoding configuration.

structured_outputs_config = Field(default_factory=StructuredOutputsConfig) class-attribute instance-attribute

Structured outputs configuration.

weight_transfer_config = None class-attribute instance-attribute

The configurations for weight transfer during RL training.

__post_init__()

Verify configs are valid & consistent with each other.

Source code in vllm/config/vllm.py
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
def __post_init__(self):
    """Verify configs are valid & consistent with each other.

    Post-init hook that cross-checks the individual sub-configs and resolves
    settings that were left unspecified (``None``) into concrete values,
    mutating the sub-configs in place. The statement order matters: later
    sections read values (compilation mode, cudagraph mode, async scheduling)
    that earlier sections decide.

    Raises:
        ValueError: If mutually incompatible options were explicitly
            requested (e.g. async scheduling with an unsupported executor
            or speculative method, returning routed experts with PP > 1 or
            a KV connector, stochastic rounding with a non-float16 SSM
            cache, ``torch_shm`` without the ``spawn`` multiproc method,
            or an explicitly enabled hybrid KV cache manager that the
            configuration cannot support).
        AssertionError: If a piecewise cudagraph mode survives without
            ``VLLM_COMPILE`` (or breakable cudagraphs), or if micro-batching
            is enabled with an unsupported all2all backend.
    """

    # To give each torch profile run a unique instance name.
    self.instance_id = f"{time.time_ns()}"

    if self.performance_mode != "balanced":
        logger.info_once("Performance mode set to '%s'.", self.performance_mode)

    self.try_verify_and_update_config()

    if self.model_config is not None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.model_config.verify_dual_chunk_attention_config(self.load_config)

        self.parallel_config.is_moe_model = self.model_config.is_moe

    if (
        self.model_config is not None
        and self.model_config.enable_return_routed_experts
    ):
        if self.parallel_config.pipeline_parallel_size > 1:
            raise ValueError(
                "--enable-return-routed-experts is incompatible with "
                "pipeline parallelism (PP > 1)."
            )

        # Incompatible with any KV connector — covers both PD disaggregation
        # (kv_producer/kv_consumer: routing captured on P can't reach D) and
        # single-instance KV offload/sharing (kv_both: slot_mapping semantics
        # change when KV blocks live outside local GPU memory, breaking the
        # slot-indexed routed_experts buffer).
        if (
            self.kv_transfer_config is not None
            and self.kv_transfer_config.is_kv_transfer_instance
        ):
            raise ValueError(
                "--enable-return-routed-experts is incompatible with KV "
                "connectors (PD disaggregation, KV cache offload)."
            )

    if self.lora_config is not None:
        self.lora_config.verify_with_model_config(self.model_config)

    if (
        self.mamba_config.enable_stochastic_rounding
        and self.cache_config.mamba_ssm_cache_dtype != "float16"
    ):
        raise ValueError(
            "Stochastic rounding for Mamba cache requires "
            "the SSM cache to be float16. Please set it explicitly, "
            "by specifying `--mamba-ssm-cache-dtype float16`, or disable "
            "stochastic rounding by not specifying "
            "`--enable-mamba-cache-stochastic-rounding`."
        )

    # Derive the quantization config from the model when none was supplied.
    if self.quant_config is None and self.model_config is not None:
        self.quant_config = VllmConfig._get_quantization_config(
            self.model_config, self.load_config
        )

    # `use_deep_gemm is None` means the user expressed no preference, so the
    # per-model auto-disable heuristic is allowed to decide.
    if (
        self.quant_config is not None
        and self.model_config is not None
        and hasattr(self.quant_config, "use_deep_gemm")
        and self.quant_config.use_deep_gemm is None
    ):
        from vllm.utils.deep_gemm import should_auto_disable_deep_gemm

        model_type = getattr(self.model_config.hf_text_config, "model_type", None)
        if should_auto_disable_deep_gemm(model_type):
            self.quant_config.use_deep_gemm = False
            logger.warning_once(
                "Auto-disabled DeepGemm for model_type=%s on Blackwell. "
                "DeepGemm E8M0 scale format causes accuracy degradation "
                "for this architecture. Falling back to CUTLASS. "
                "To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0.",
                model_type,
            )

    from vllm.platforms import current_platform
    from vllm.v1.executor.abstract import Executor

    executor_backend = self.parallel_config.distributed_executor_backend
    executor_class = Executor.get_class(self)
    executor_supports_async_sched = executor_class.supports_async_scheduling()
    uses_rocm_deepep_ht_dbo = (
        current_platform.is_rocm()
        and self.parallel_config.enable_dbo
        and self.parallel_config.all2all_backend == "deepep_high_throughput"
    )

    # async_scheduling is tri-state: True (explicit, hard-fail on conflicts),
    # None (auto-select, silently fall back), False (explicitly off).
    if self.scheduler_config.async_scheduling:
        # Async scheduling explicitly enabled, hard fail any incompatibilities.
        # Currently, async scheduling only support eagle speculative
        # decoding.
        if uses_rocm_deepep_ht_dbo:
            raise ValueError(
                "Async scheduling is not compatible with ROCm DeepEP "
                "high-throughput DBO. Please use --no-async-scheduling or "
                "select a different all2all backend."
            )
        if self.speculative_config is not None:
            if (
                self.speculative_config.method not in get_args(EagleModelTypes)
                and self.speculative_config.method not in get_args(NgramGPUTypes)
                and self.speculative_config.method != "draft_model"
            ):
                raise ValueError(
                    "Currently, async scheduling is only supported "
                    "with EAGLE/MTP/Draft Model/NGram GPU kind of "
                    "speculative decoding"
                )
            if self.speculative_config.disable_padded_drafter_batch:
                raise ValueError(
                    "Async scheduling is not compatible with "
                    "disable_padded_drafter_batch=True."
                )
        if not executor_supports_async_sched:
            raise ValueError(
                f"`{executor_backend}` does not support async scheduling yet."
            )
    elif self.scheduler_config.async_scheduling is None:
        # Enable async scheduling unless there is an incompatible option.
        if (
            self.model_config is not None
            and self.model_config.runner_type == "pooling"
        ):
            # The current implementation of asynchronous scheduling negatively
            # impacts performance of pooling models, so we disable by default.
            logger.debug(
                "Disabling asynchronous scheduling by default for pooling model."
            )
            self.scheduler_config.async_scheduling = False
        elif (
            self.speculative_config is not None
            and self.speculative_config.method not in get_args(EagleModelTypes)
            and self.speculative_config.method not in get_args(NgramGPUTypes)
        ):
            logger.warning_once(
                "Async scheduling not supported with %s-based "
                "speculative decoding and will be disabled.",
                self.speculative_config.method,
            )
            self.scheduler_config.async_scheduling = False
        elif (
            self.speculative_config is not None
            and self.speculative_config.disable_padded_drafter_batch
        ):
            logger.warning_once(
                "Async scheduling is not compatible with "
                "disable_padded_drafter_batch=True and will be disabled.",
            )
            self.scheduler_config.async_scheduling = False
        elif not executor_supports_async_sched:
            logger.warning_once(
                "Async scheduling will be disabled because it is not supported "
                "with the `%s` distributed executor backend. ",
                executor_backend,
            )
            self.scheduler_config.async_scheduling = False
        elif uses_rocm_deepep_ht_dbo:
            logger.warning_once(
                "Async scheduling is disabled for ROCm DeepEP "
                "high-throughput DBO because that combination can corrupt "
                "DP+EP generation accuracy."
            )
            self.scheduler_config.async_scheduling = False
        else:
            self.scheduler_config.async_scheduling = True

    logger.info_once(
        "Asynchronous scheduling is %s.",
        "enabled" if self.scheduler_config.async_scheduling else "disabled",
    )

    if self.parallel_config.disable_nccl_for_dp_synchronization is None:
        if self.scheduler_config.async_scheduling:
            # Only the log line is conditional on DP>1 with a MoE (or unknown)
            # model; the flag itself is set whenever async scheduling is on.
            if self.parallel_config.data_parallel_size > 1 and (
                self.model_config is None or self.model_config.is_moe
            ):
                logger.info_once(
                    "Disabling NCCL for DP synchronization "
                    "when using async scheduling.",
                )
            self.parallel_config.disable_nccl_for_dp_synchronization = True
        else:
            self.parallel_config.disable_nccl_for_dp_synchronization = False

    if (
        self.speculative_config is not None
        and self.scheduler_config.async_scheduling
        and self.model_config is not None
        and not self.model_config.disable_cascade_attn
    ):
        logger.warning_once(
            "Disabling cascade attention (not yet compatible with "
            "async speculative decoding).",
        )
        self.model_config.disable_cascade_attn = True

    if (
        self.model_config is not None
        and self.model_config.multimodal_config is not None
        and self.model_config.multimodal_config.mm_tensor_ipc == "torch_shm"
        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
    ):
        raise ValueError(
            "torch_shm is known to fail without "
            "VLLM_WORKER_MULTIPROC_METHOD set to spawn"
        )

    if (
        self.model_config is not None
        and self.scheduler_config.enable_chunked_prefill
        and self.model_config.dtype == torch.float32
        and current_platform.get_device_capability() == (7, 5)
    ):
        logger.warning_once(
            "Turing devices tensor cores do not support float32 matmul. "
            "To workaround this limitation, vLLM will set 'ieee' input "
            "precision for chunked prefill triton kernels."
        )

    if self.model_config is not None and self.model_config.enforce_eager:
        logger.warning(
            "Enforce eager set, disabling torch.compile and CUDAGraphs. "
            "This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none"
        )
        self.compilation_config.mode = CompilationMode.NONE
        self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    if os.environ.get("TORCH_COMPILE_DISABLE") == "1":
        logger.warning(
            "TORCH_COMPILE_DISABLE is set, disabling torch.compile. "
            "This is equivalent to setting -cc.mode=none"
        )
        self.compilation_config.mode = CompilationMode.NONE

    # For model classes don't carry @support_torch_compile —
    # the breakable cudagraph is the supported PIECEWISE path. Auto-enable
    # it unless the user has explicitly opted out via the env var.
    if (
        self.model_config is not None
        and "VLLM_USE_BREAKABLE_CUDAGRAPH" not in os.environ
        and any(
            a
            in (
                "DeepseekV4ForCausalLM",
                "DeepSeekV4MTPModel",
                "MiniMaxM3SparseForCausalLM",
                "MiniMaxM3SparseForConditionalGeneration",
            )
            for a in self.model_config.architectures
        )
    ):
        os.environ["VLLM_USE_BREAKABLE_CUDAGRAPH"] = "1"
        logger.info_once(
            "Auto-enabling VLLM_USE_BREAKABLE_CUDAGRAPH=1. "
            "Set VLLM_USE_BREAKABLE_CUDAGRAPH=0 to opt out."
        )

    if envs.VLLM_USE_BREAKABLE_CUDAGRAPH:
        logger.warning_once(
            "VLLM_USE_BREAKABLE_CUDAGRAPH is set, disabling vLLM's "
            "torch.compile pipeline. Equivalent to -cc.mode=none."
        )
        self.compilation_config.mode = CompilationMode.NONE

    if self.compilation_config.backend == "eager" or (
        self.compilation_config.mode is not None
        and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
    ):
        logger.warning(
            "Inductor compilation was disabled by user settings, "
            "optimizations settings that are only active during "
            "inductor compilation will be ignored."
        )

    def has_blocked_weights():
        """Return True when the quant config reports block-quantized weights."""
        if self.quant_config is not None:
            if hasattr(self.quant_config, "weight_block_size"):
                return self.quant_config.weight_block_size is not None
            elif hasattr(self.quant_config, "has_blocked_weights"):
                return self.quant_config.has_blocked_weights()
        return False

    # Enable quant_fp8 CUDA ops (TODO disable in follow up)
    # On H100 the CUDA kernel is faster than
    # native implementation
    # https://github.com/vllm-project/vllm/issues/25094
    if has_blocked_weights():
        custom_ops = self.compilation_config.custom_ops
        # Skip when the user opted out ("-quant_fp8") or the op is already
        # enabled, so the list never accumulates duplicate entries.
        if "-quant_fp8" not in custom_ops and "+quant_fp8" not in custom_ops:
            custom_ops.append("+quant_fp8")

    current_platform.apply_config_platform_defaults(self)

    if self.compilation_config.mode is None:
        if self.optimization_level > OptimizationLevel.O0:
            self.compilation_config.mode = CompilationMode.VLLM_COMPILE
        else:
            self.compilation_config.mode = CompilationMode.NONE

    # By default, enable torch wrapping only when using custom Inductor lowering
    if self.compilation_config.ir_enable_torch_wrap is None:
        self.compilation_config.ir_enable_torch_wrap = (
            self.compilation_config.mode == CompilationMode.VLLM_COMPILE
            and self.compilation_config.backend == "inductor"
        )

    # If the user chose neither "all" nor "none", pick the blanket custom-op
    # default from the backend and compilation mode.
    if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
        if (
            self.compilation_config.backend == "inductor"
            and self.compilation_config.mode != CompilationMode.NONE
        ):
            self.compilation_config.custom_ops.append("none")
        else:
            self.compilation_config.custom_ops.append("all")

    # This populates IR op priorities,
    # must happen after compilation mode and backend are decided,
    # but before fusion defaults are applied as those may depend on op priority.
    self.kernel_config.set_platform_defaults(self)

    default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
    self._apply_optimization_level_defaults(default_config)
    if self.kernel_config.enable_flashinfer_autotune is None:
        raise ValueError(
            "KernelConfig.enable_flashinfer_autotune must be set after applying "
            "optimization level defaults."
        )

    self._maybe_override_dynamic_sd_cudagraph_mode()

    if (
        self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
        and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
        and not envs.VLLM_USE_BREAKABLE_CUDAGRAPH
    ):
        logger.info(
            "Cudagraph mode %s is not compatible with compilation mode %s. "
            "Overriding to NONE.",
            self.compilation_config.cudagraph_mode,
            self.compilation_config.mode,
        )
        self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    # async tp is built on top of sequence parallelism and requires it.
    pass_config = self.compilation_config.pass_config
    if pass_config.fuse_gemm_comms:
        pass_config.enable_sp = True
    if pass_config.enable_sp:
        if self.parallel_config.tensor_parallel_size == 1:
            logger.warning("Sequence Parallelism requires TP>1, disabling")
            pass_config.enable_sp = False
            pass_config.fuse_gemm_comms = False
        else:
            if pass_config.sp_min_token_num is None:
                from vllm.compilation.passes.fusion.sequence_parallelism import (
                    get_sequence_parallelism_threshold,
                )

                tp_size = self.parallel_config.tensor_parallel_size
                hidden_size = self.model_config.get_hidden_size()
                assert isinstance(self.model_config.dtype, torch.dtype)
                element_size = self.model_config.dtype.itemsize
                pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                    hidden_size, tp_size, element_size
                )

            # Still None after the heuristic ran: no threshold was produced.
            if pass_config.sp_min_token_num is None:
                logger.warning(
                    "Model hidden_size too small for the SP "
                    "threshold heuristic, disabling. To force SP, "
                    "set pass_config.sp_min_token_num manually."
                )
                pass_config.enable_sp = False
                pass_config.fuse_gemm_comms = False

    from vllm.utils.torch_utils import HAS_OPAQUE_TYPE

    if HAS_OPAQUE_TYPE:
        # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
        # fast_moe_cold_start, so force it off.
        self.compilation_config.fast_moe_cold_start = False
    elif self.compilation_config.fast_moe_cold_start is None:
        # resolve default behavior: try to be as safe as possible
        # this config is unsafe if any spec decoding draft model has a MOE.
        # We'll conservatively turn it off if we see spec decoding.
        self.compilation_config.fast_moe_cold_start = (
            self.speculative_config is None
        )

    self._set_max_num_scheduled_tokens()

    if current_platform.support_static_graph_mode():
        # if cudagraph_mode has full cudagraphs, we need to check support
        if model_config := self.model_config:
            if (
                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                and model_config.pooler_config is not None
            ):
                logger.warning_once(
                    "Pooling models do not support full cudagraphs. "
                    "Overriding cudagraph_mode to PIECEWISE."
                )
                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
            elif (
                model_config.is_encoder_decoder
                and self.compilation_config.cudagraph_mode
                not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
            ):
                logger.info_once(
                    "Encoder-decoder models do not support %s. "
                    "Overriding cudagraph_mode to FULL_DECODE_ONLY.",
                    self.compilation_config.cudagraph_mode.name,
                )
                self.compilation_config.cudagraph_mode = (
                    CUDAGraphMode.FULL_DECODE_ONLY
                )

        # Check if KV connector requires PIECEWISE mode for CUDA graphs
        if (
            self.kv_transfer_config is not None
            and self.kv_transfer_config.is_kv_transfer_instance
            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        ):
            # Lazy import to avoid circular dependencies
            from vllm.distributed.kv_transfer.kv_connector.factory import (
                KVConnectorFactory,
            )

            connector_cls = KVConnectorFactory.get_connector_class(
                self.kv_transfer_config
            )
            if connector_cls.requires_piecewise_for_cudagraph(
                self.kv_transfer_config.kv_connector_extra_config
            ):
                logger.warning_once(
                    "KV connector %s requires PIECEWISE CUDA graph mode "
                    "due to layerwise async operations that cannot be "
                    "captured in CUDA graphs. "
                    "Overriding cudagraph_mode from %s to PIECEWISE.",
                    connector_cls.__name__,
                    self.compilation_config.cudagraph_mode.name,
                )
                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

        # disable cudagraph when enforce eager execution
        if self.model_config is not None and self.model_config.enforce_eager:
            logger.info("Cudagraph is disabled under eager mode")
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            # override related settings when enforce eager
            self.compilation_config.max_cudagraph_capture_size = 0
            self.compilation_config.cudagraph_capture_sizes = []
        else:
            self.compilation_config.cudagraph_num_of_warmups = 1

        self._set_cudagraph_sizes()

    else:
        self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    if self.cache_config.kv_sharing_fast_prefill:
        if (
            self.speculative_config is not None
            and self.speculative_config.use_eagle()
        ):
            raise ValueError(
                "Fast prefill optimization for KV sharing is not "
                "compatible with EAGLE as EAGLE requires correct logits "
                "for all tokens while fast prefill gives incorrect logits "
                "for prompt tokens."
            )

        logger.warning_once(
            "--kv-sharing-fast-prefill requires changes on model side for "
            "correctness and to realize prefill savings."
        )

    if (
        self.model_config
        and self.model_config.architecture == "WhisperForConditionalGeneration"
        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
    ):
        logger.warning(
            "Whisper is known to have issues with "
            "forked workers. If startup is hanging, "
            "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
            "to 'spawn'."
        )

    if (
        self.kv_events_config is not None
        and self.kv_events_config.enable_kv_cache_events
        and not self.cache_config.enable_prefix_caching
    ):
        logger.warning(
            "KV cache events are on, but prefix caching is not enabled. "
            "Use --enable-prefix-caching to enable."
        )
    if (
        self.kv_events_config is not None
        and self.kv_events_config.publisher != "null"
        and not self.kv_events_config.enable_kv_cache_events
    ):
        logger.warning(
            "KV cache events are disabled, "
            "but the scheduler is configured to publish them. "
            "Modify KVEventsConfig.enable_kv_cache_events "
            "to True to enable."
        )
    current_platform.check_and_update_config(self)

    if self.use_v2_model_runner:
        self._validate_v2_model_runner()

    # Re-compute compile ranges after platform-specific config updates
    # (e.g., XPU may lower max_num_batched_tokens when MLA is enabled)
    self._set_compile_ranges()

    # Do this after all the updates to compilation_config.mode
    effective_dp_size = (
        self.parallel_config.data_parallel_size
        if self.model_config is None or self.model_config.is_moe
        else 1
    )
    self.compilation_config.set_splitting_ops_for_v1(
        all2all_backend=self.parallel_config.all2all_backend,
        data_parallel_size=effective_dp_size,
    )

    if self.compilation_config.pass_config.enable_sp:
        # With pipeline parallelism, native rms norm tracing errors due to
        # incorrect residual shape.
        # Use custom rms norm to unblock. In the future,
        # the pass will operate on higher-level IR to avoid the issue.
        # TODO: https://github.com/vllm-project/vllm/issues/27894
        if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
            logger.warning(
                "Sequence parallelism is enabled, but running in wrong "
                "vllm compile mode: %s.",
                self.compilation_config.mode,
            )

        if self.parallel_config.pipeline_parallel_size > 1:
            if "-rms_norm" not in self.compilation_config.custom_ops:
                self.compilation_config.custom_ops.append("+rms_norm")
            else:
                logger.warning_once(
                    "Sequence parallelism not supported with "
                    "native rms_norm when using %s, "
                    "this will likely lead to an error.",
                    "pipeline parallelism",
                )

    # final check of cudagraph mode after all possible updates
    if current_platform.is_cuda_alike():
        if (
            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
            and self.model_config is not None
            and not self.model_config.disable_cascade_attn
            and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()  # noqa: E501
        ):
            logger.warning_once(
                "No piecewise cudagraph for executing cascade attention. "
                "Will fall back to eager execution if a batch runs into "
                "cascade attentions."
            )

        if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
            assert (
                self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                or envs.VLLM_USE_BREAKABLE_CUDAGRAPH
            ), (
                "Compilation mode should be CompilationMode.VLLM_COMPILE "
                "when cudagraph_mode piecewise cudagraphs is used, "
                f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
            )
    if (
        self.model_config
        and envs.VLLM_BATCH_INVARIANT
        and not self.model_config.disable_cascade_attn
    ):
        self.model_config.disable_cascade_attn = True
        logger.warning_once(
            "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
        )

    if self.parallel_config.use_ubatching:
        a2a_backend = self.parallel_config.all2all_backend
        assert a2a_backend in [
            "deepep_low_latency",
            "deepep_high_throughput",
            "nixl_ep",
        ], (
            "Microbatching currently only supports the deepep_low_latency, "
            "deepep_high_throughput, and nixl_ep all2all backends. "
            f"{a2a_backend} is not supported. To fix use "
            "--all2all-backend=deepep_low_latency, "
            "--all2all-backend=deepep_high_throughput, or "
            "--all2all-backend=nixl_ep and install the matching kernels."
        )

        # model_config may be None (it is guarded everywhere else in this
        # method), so only touch cascade attention when a model is present.
        if (
            self.model_config is not None
            and not self.model_config.disable_cascade_attn
        ):
            self.model_config.disable_cascade_attn = True
            logger.warning_once("Disabling cascade attention when DBO is enabled.")

    # Fallback in case instance_id was cleared after being set at the top of
    # this method — NOTE(review): confirm whether any hook above can reset it.
    if not self.instance_id:
        self.instance_id = random_uuid()[:5]

    if self.reasoning_config is not None and self.model_config is not None:
        self.reasoning_config.initialize_token_ids(self.model_config)
        if not self.reasoning_config.enabled:
            logger.warning_once(
                "Auto-initialization of reasoning token IDs failed. "
                "Please check whether your reasoning parser has implemented "
                "the `reasoning_start_str` and `reasoning_end_str`."
            )

    # Resolve kv_offloading-derived connector name into kv_transfer_config
    # before the HMA check below, which inspects the connector class.
    self._post_init_kv_transfer_config()

    # Hybrid KV cache manager (HMA) runtime rules:
    # - Explicit enable (--no-disable-kv-cache-manager): error if runtime
    #   disables it
    # - No preference: auto-disable for unsupported features or connector configs
    # - Explicit disable (--disable-kv-cache-manager): always respect it
    need_disable_hybrid_kv_cache_manager = False
    # logger should only print warning message for hybrid models. As we
    # can't know whether the model is hybrid or not now, so we don't log
    # warning message here and will log it later.
    if not current_platform.support_hybrid_kv_cache():
        # Hybrid KV cache manager is not supported on non-GPU platforms.
        need_disable_hybrid_kv_cache_manager = True
    if (
        self.model_config is not None
        and self.model_config.attention_chunk_size is not None
    ):
        if (
            self.speculative_config is not None
            and self.speculative_config.use_eagle()
        ):
            # Hybrid KV cache manager is not yet supported with chunked
            # local attention + eagle.
            need_disable_hybrid_kv_cache_manager = True
        elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
            logger.warning(
                "There is a latency regression when using chunked local"
                " attention with the hybrid KV cache manager. Disabling"
                " it, by default. To enable it, set the environment "
                "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
            )
            # Hybrid KV cache manager is not yet supported with chunked
            # local attention.
            need_disable_hybrid_kv_cache_manager = True

    if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
        # Auto-disable HMA only when the connector config does not support it.
        if self.kv_transfer_config is not None:
            from vllm.distributed.kv_transfer.kv_connector.factory import (
                KVConnectorFactory,
            )

            if not KVConnectorFactory.supports_hma_config(self.kv_transfer_config):
                need_disable_hybrid_kv_cache_manager = True
                logger.warning(
                    "Turning off hybrid kv cache manager because "
                    "`--kv-transfer-config` selects a KV connector that "
                    "does not support it. Impact: hybrid SSM models "
                    "(e.g. Jamba, Bamba) require HMA and will fail at "
                    "startup without it; models with sliding window "
                    "attention will run with reduced performance. "
                    "To add HMA support to a KV connector, subclass "
                    "`SupportsHMA` defined in kv_connector/v1/base.py "
                    "(for MultiConnector, all child connectors must "
                    "support HMA)."
                )
        self.scheduler_config.disable_hybrid_kv_cache_manager = (
            need_disable_hybrid_kv_cache_manager
        )
    elif (
        self.scheduler_config.disable_hybrid_kv_cache_manager is False
        and need_disable_hybrid_kv_cache_manager
    ):
        raise ValueError(
            "Hybrid KV cache manager was explicitly enabled but is not "
            "supported in this configuration. Consider omitting the "
            "--no-disable-hybrid-kv-cache-manager flag to let vLLM decide"
            " automatically."
        )

    if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
        # Default to enable HMA if not explicitly disabled by user or logic above.
        self.scheduler_config.disable_hybrid_kv_cache_manager = False

    if self.compilation_config.debug_dump_path:
        self.compilation_config.debug_dump_path = (
            self.compilation_config.debug_dump_path.absolute().expanduser()
        )
    if envs.VLLM_DEBUG_DUMP_PATH is not None:
        env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser()
        if self.compilation_config.debug_dump_path:
            logger.warning(
                "Config-specified debug dump path is overridden"
                " by VLLM_DEBUG_DUMP_PATH to %s",
                env_path,
            )
        self.compilation_config.debug_dump_path = env_path

    # Enable quant_fp8 CUDA ops (TODO disable in follow up)
    # On H100 the CUDA kernel is faster than
    # native implementation
    # https://github.com/vllm-project/vllm/issues/25094
    # Repeated after the platform hooks above have run; idempotent, so it only
    # appends when the earlier pass's entry is no longer present.
    if has_blocked_weights():
        custom_ops = self.compilation_config.custom_ops
        if "-quant_fp8" not in custom_ops and "+quant_fp8" not in custom_ops:
            custom_ops.append("+quant_fp8")

    self._verify_kv_transfer_compat()
    # Log the custom passes that are enabled
    self.compilation_config.pass_config.log_enabled_passes()

_apply_optimization_level_defaults(defaults)

Apply optimization level defaults using self as root.

Recursively applies values from defaults into nested config objects. Only fields present in defaults are overwritten.

If the user configuration does not specify a value for a default field and if the default field is still None after all user selections are applied, then default values will be applied to the field. User specified fields will not be overridden by the default.

Parameters:

  • defaults

    (dict[str, Any]) –

    Dictionary of default values to apply.

Source code in vllm/config/vllm.py
def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None:
    """Apply optimization level defaults using self as root.

    Recursively applies values from defaults into nested config objects.
    Only fields present in defaults are overwritten.

    If the user configuration does not specify a value for a default field
    and if the default field is still None after all user selections are
    applied, then default values will be applied to the field. User specified
    fields will not be overridden by the default.

    Args:
        defaults: Dictionary of default values to apply.
    """

    def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None:
        """Recursively apply defaults to config_obj, using self as root."""
        for key, value in config_defaults.items():
            if not hasattr(config_obj, key):
                continue

            current = getattr(config_obj, key)
            if isinstance(value, dict) and is_dataclass(current):
                apply_recursive(current, value)
            else:
                self._set_config_default(config_obj, key, value)

    apply_recursive(self, defaults)

_get_quantization_config(model_config, load_config) staticmethod

Get the quantization config.

Source code in vllm/config/vllm.py
@staticmethod
def _get_quantization_config(
    model_config: ModelConfig, load_config: LoadConfig
) -> QuantizationConfig | None:
    """Build and validate the quantization config for a model.

    Returns None when the model config does not request quantization.
    Otherwise the quantization config is loaded, checked against the current
    device capability (when the platform reports one) and against the model
    dtype, then given a chance to update itself from the HF config.

    Raises:
        ValueError: If the device capability is below the method's minimum,
            or the model dtype is not a supported activation dtype.
    """
    from vllm.platforms import current_platform

    if model_config.quantization is None:
        return None

    from vllm.model_executor.model_loader.weight_utils import get_quant_config

    quant_config = get_quant_config(model_config, load_config)

    # The capability check is skipped when the platform reports none.
    device_capability = current_platform.get_device_capability()
    if device_capability is not None:
        capability = device_capability.to_int()
        min_capability = quant_config.get_min_capability()
        if capability < min_capability:
            raise ValueError(
                f"The quantization method {model_config.quantization} "
                "is not supported for the current GPU. Minimum "
                f"capability: {min_capability}. "
                f"Current capability: {capability}."
            )

    supported_dtypes = quant_config.get_supported_act_dtypes()
    if model_config.dtype not in supported_dtypes:
        raise ValueError(
            f"{model_config.dtype} is not supported for quantization "
            f"method {model_config.quantization}. Supported dtypes: "
            f"{supported_dtypes}"
        )

    quant_config.maybe_update_config(
        model_config.model,
        hf_config=model_config.hf_config,
    )
    return quant_config

_get_v2_model_runner_unsupported_features()

Collect features not yet supported by the V2 model runner.

Source code in vllm/config/vllm.py
def _get_v2_model_runner_unsupported_features(self) -> list[str]:
    """List the configured features the V2 model runner cannot handle yet.

    Returns:
        Human-readable feature names, in the order the checks run. The list
        is empty when nothing in the configuration is unsupported.
    """
    gaps: list[str] = []
    model_config = self.model_config
    spec = self.speculative_config
    parallel = self.parallel_config
    compilation = self.compilation_config

    if parallel.prefill_context_parallel_size > 1:
        gaps.append("prefill context parallelism")

    if compilation.mode == CompilationMode.STOCK_TORCH_COMPILE:
        gaps.append("stock torch.compile")

    if compilation.pass_config.enable_sp and parallel.tensor_parallel_size > 1:
        gaps.append("sequence parallelism")

    # V2 does not implement the external_launcher (torchrun) PP-output
    # broadcast that V1 uses to keep all ranks in sync (broadcast_pp_output).
    if (
        parallel.distributed_executor_backend == "external_launcher"
        and parallel.pipeline_parallel_size > 1
    ):
        gaps.append("pipeline parallelism with external_launcher")

    if spec is not None:
        method = spec.method
        # TODO: ngram / ngram_gpu are not supported by the v2 model runner yet
        if method in ("ngram", "ngram_gpu"):
            gaps.append("ngram/ngram_gpu speculative decoding")
        elif method not in ("eagle", "eagle3", "mtp", "dflash"):
            gaps.append(f"speculative method '{method}'")

        if spec.uses_dynamic_speculative_decoding():
            gaps.append("dynamic speculative decoding")

        # V2 EagleSpeculator does not support parallel_drafting (for P-Eagle)
        # DFlash uses parallel drafting natively in V2 via DFlashSpeculator.
        if spec.parallel_drafting and method != "dflash":
            gaps.append("parallel drafting for EAGLE speculative decoding")

        if method == "eagle3" and parallel.pipeline_parallel_size > 1:
            gaps.append("EAGLE3 with pipeline parallelism")

    if parallel.enable_dbo:
        gaps.append("dual batch overlap")

    if parallel.enable_elastic_ep:
        gaps.append("elastic expert parallelism")

    # All model-level checks are skipped when there is no model config.
    if model_config is not None:
        if model_config.enable_return_routed_experts:
            # Will be added by https://github.com/vllm-project/vllm/pull/38163
            gaps.append("routed experts capture")

        from importlib.metadata import entry_points

        # Installed plugins count as custom logits processors as well.
        plugin_processors = bool(entry_points(group="vllm.logits_processors"))
        if model_config.logits_processors or plugin_processors:
            gaps.append("custom logits processors")

        if model_config.enable_prompt_embeds:
            gaps.append("prompt embeds")

        if model_config.runner_type == "generate" and model_config.logprobs_mode in (
            "raw_logits",
            "processed_logits",
        ):
            gaps.append(f"logprobs mode '{model_config.logprobs_mode}'")

    if self.cache_config.kv_sharing_fast_prefill:
        # Will be added by https://github.com/vllm-project/vllm/pull/35045
        gaps.append("KV sharing fast prefill")

    if self.ec_transfer_config is not None:
        # Will be added by https://github.com/vllm-project/vllm/pull/38390
        gaps.append("EC transfer")

    return gaps

_post_init_kv_transfer_config()

Update KVTransferConfig based on top-level configs in VllmConfig.

Right now, this function reads the offloading settings from CacheConfig and configures the KVTransferConfig accordingly.

Source code in vllm/config/vllm.py
def _post_init_kv_transfer_config(self) -> None:
    """Update KVTransferConfig based on top-level configs in VllmConfig.

    Right now, this function reads the offloading settings from
    CacheConfig and configures the KVTransferConfig accordingly.
    """
    # KV offloading is only activated when kv_offloading_size is set.
    if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
        return

    kv_offloading_backend = self.cache_config.kv_offloading_backend

    # If no KVTransferConfig is provided, create a default one.
    if self.kv_transfer_config is None:
        self.kv_transfer_config = KVTransferConfig()

    if kv_offloading_backend == "native":
        if envs.VLLM_USE_SIMPLE_KV_OFFLOAD:
            config_connector = "SimpleCPUOffloadConnector"
        else:
            config_connector = "OffloadingConnector"
        self.kv_transfer_config.kv_connector = config_connector
        self.kv_transfer_config.kv_connector_extra_config.update(
            {"cpu_bytes_to_use": kv_offloading_size * (1 << 30)}
        )
    elif kv_offloading_backend == "lmcache":
        # Default to LMCache multi-process (MP) mode. The actual KV
        # storage capacity is managed by the standalone LMCache server
        # process, so ``kv_offloading_size`` is not propagated here.
        # ``LMCacheMPConnector`` falls back to ``tcp://localhost:5555``
        # when host/port are not provided via extra_config.
        self.kv_transfer_config.kv_connector = "LMCacheMPConnector"

    # This is the same for all backends
    self.kv_transfer_config.kv_role = "kv_both"

_set_compile_ranges()

Set the compile ranges for the compilation config.

Source code in vllm/config/vllm.py
def _set_compile_ranges(self):
    """
    Set the compile ranges for the compilation config.
    """
    compilation_config = self.compilation_config
    computed_compile_ranges_endpoints = []

    # The upper bound of the compile ranges is the max_num_batched_tokens.
    compile_range_end = self.scheduler_config.max_num_batched_tokens
    if compile_range_end is not None:
        computed_compile_ranges_endpoints.append(compile_range_end)

    # Add the compile ranges for flashinfer/aiter.
    if compilation_config.pass_config.fuse_allreduce_rms:
        tp_size = self.parallel_config.tensor_parallel_size
        from vllm._aiter_ops import rocm_aiter_ops

        if rocm_aiter_ops.is_enabled():
            max_size = rocm_aiter_ops.get_aiter_allreduce_max_size()
        else:
            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
        if max_size is not None and self.model_config is not None:
            assert isinstance(self.model_config.dtype, torch.dtype)
            max_token_num = max_size // (
                self.model_config.get_hidden_size()
                * self.model_config.dtype.itemsize
            )
            if compile_range_end is not None and max_token_num < compile_range_end:
                computed_compile_ranges_endpoints.append(max_token_num)
            else:
                logger.debug(
                    "Max num batched tokens below allreduce-rms fusion threshold, "
                    "allreduce-rms fusion will be enabled for all num_tokens."
                )

    # Add the compile ranges for sequence parallelism
    if compilation_config.pass_config.enable_sp:
        pass_config = compilation_config.pass_config

        # Calculate min_token_num if not explicitly provided
        # User override works regardless of hidden_size
        if pass_config.sp_min_token_num is None:
            from vllm.compilation.passes.fusion.sequence_parallelism import (
                get_sequence_parallelism_threshold,
            )

            tp_size = self.parallel_config.tensor_parallel_size
            hidden_size = self.model_config.get_hidden_size()
            assert isinstance(self.model_config.dtype, torch.dtype)
            element_size = self.model_config.dtype.itemsize
            pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                hidden_size, tp_size, element_size
            )

        min_token_num = pass_config.sp_min_token_num
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        if min_token_num is not None and (
            max_num_batched_tokens is not None
            and min_token_num < max_num_batched_tokens
            and min_token_num > 1
        ):
            # Add endpoint at min_token_num - 1 to ensure SP applies
            # starting from min_token_num
            # This creates ranges: [1, min-1] (no SP), [min, max] (SP applies)
            computed_compile_ranges_endpoints.append(min_token_num - 1)

    if compilation_config.pass_config.fuse_rope_kvcache:
        max_token_num = (
            compilation_config.pass_config.rope_kvcache_fusion_max_token_num
        )
        if max_token_num is not None:
            if compile_range_end is not None and max_token_num < compile_range_end:
                computed_compile_ranges_endpoints.append(max_token_num)
            else:
                logger.debug(
                    "Max num batched tokens below rope+kvcache fusion threshold, "
                    "rope+kvcache fusion enabled for num_tokens <= %d.",
                    compile_range_end,
                )

    if compilation_config.compile_ranges_endpoints is not None:
        for x in compilation_config.compile_ranges_endpoints:
            assert isinstance(x, int)
            assert x > 0, f"Invalid compile range endpoint: {x}"
            if compile_range_end is not None and x < compile_range_end and x > 1:
                computed_compile_ranges_endpoints.append(x)
    compilation_config.compile_ranges_endpoints = sorted(
        computed_compile_ranges_endpoints
    )

_set_config_default(config_obj, key, value)

Set config attribute to default if not already set by user.

Parameters:

  • config_obj

    (Any) –

    Configuration object to update.

  • key

    (str) –

    Attribute name.

  • value

    (Any) –

    Default value (static or callable).

Source code in vllm/config/vllm.py
def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None:
    """Set config attribute to default if not already set by user.

    Args:
        config_obj: Configuration object to update.
        key: Attribute name.
        value: Default value (static or callable).
    """
    if getattr(config_obj, key) is None:
        # Some config values are known before initialization and are
        # hard coded.
        # Other values depend on the user given configuration, so they are
        # implemented with lambda functions and decided at run time.
        setattr(config_obj, key, value(self) if callable(value) else value)

_set_cudagraph_sizes()

vLLM defines the default candidate list of batch sizes for CUDA graph capture as:

```python
max_graph_size = min(max_num_seqs * 2, 512)
# 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
# up to max_graph_size
cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
    range(256, max_graph_size + 1, 16))
```

max_num_batched_tokens is also appended to the list if it fits within max_cudagraph_capture_size, so the max batch size is captured even when off-stride.

In the end, vllm_config.compilation_config.cudagraph_capture_sizes will be the final sizes to capture cudagraph (in ascending order).

These sizes are used to capture and reuse CUDA graphs for performance-critical paths (e.g., decoding). Capturing enables significantly faster kernel dispatch by avoiding Python overhead. The list is then filtered based on max_num_batched_tokens (e.g., 8192 on most GPUs), which controls the total allowed number of tokens in a batch. Since each sequence may have a variable number of tokens, the maximum usable batch size will depend on actual sequence lengths.

Example: With max_num_batched_tokens = 8192, and typical sequences averaging ~32 tokens, most practical batch sizes fall below 256. However, the system will still allow capture sizes up to 512 if shape and memory permit.

Note: If users explicitly specify cudagraph capture sizes in the compilation config, those will override this default logic. At runtime:

- If batch size <= one of the `cudagraph_capture_sizes`, the closest padded CUDA graph will be used.
- If batch size > largest `cudagraph_capture_sizes`, cudagraph will not be used.
Source code in vllm/config/vllm.py
def _set_cudagraph_sizes(self):
    """
    Resolve the CUDA graph capture sizes and write them, together with the
    final `max_cudagraph_capture_size`, back to the compilation config.

    vLLM defines the default candidate list of batch sizes for CUDA graph
    capture as:

    ```python
    decode_query_len = 1 + num_speculative_tokens
    max_graph_size = min(max_num_seqs * decode_query_len * 2, 512)
    # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
    # up to max_graph_size
    cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
        range(256, max_graph_size + 1, 16))
    ```

    `max_graph_size` is additionally capped at `max_num_batched_tokens`.
    When `performance_mode` is `"interactivity"`, the leading `[1, 2, 4]`
    is replaced by every size from 1 up to `min(max_graph_size, 32)`.

    `max_num_batched_tokens` is also appended to the list if it fits
    within `max_cudagraph_capture_size`, so the max batch size is captured
    even when off-stride.

    In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
    will be the final sizes to capture cudagraph (in ascending order).

    These sizes are used to capture and reuse CUDA graphs for
    performance-critical paths (e.g., decoding). Capturing enables
    significantly faster kernel dispatch by avoiding Python overhead. The
    list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
    most GPUs), which controls the total allowed number of tokens in a
    batch. Since each sequence may have a variable number of tokens, the
    maximum usable batch size will depend on actual sequence lengths.

    When there is no model config, eager mode is enforced, or the cudagraph
    mode is `NONE`, no sizes are captured: the maximum is set to 0 and the
    size list to empty.

    Example:
        With `max_num_batched_tokens = 8192`, and typical sequences
        averaging ~32 tokens, most practical batch sizes fall below 256.
        However, the system will still allow capture sizes up to 512 if
        shape and memory permit.

    Note:
        If users explicitly specify cudagraph capture sizes in the
        compilation config, those will override this default logic.
        At runtime:

        - If batch size <= one of the `cudagraph_capture_sizes`, the closest
        padded CUDA graph will be used.
        - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
        not be used.

    Raises:
        ValueError: If the user set both `max_cudagraph_capture_size` and
            `cudagraph_capture_sizes` and the maximum does not match the
            largest remaining capture size.
    """

    if (
        self.model_config is not None
        and not self.model_config.enforce_eager
        and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
    ):
        # determine the initial max_cudagraph_capture_size
        max_cudagraph_capture_size = (
            self.compilation_config.max_cudagraph_capture_size
        )
        if max_cudagraph_capture_size is None:
            # each decode step handles one token plus the speculative ones
            decode_query_len = 1 + self.num_speculative_tokens
            max_cudagraph_capture_size = min(
                self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
            )
        # never capture more tokens than a batch is allowed to hold
        max_num_tokens = self.scheduler_config.max_num_batched_tokens
        max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)

        assert max_cudagraph_capture_size >= 1, (
            "Maximum cudagraph size should be greater than or equal to 1 "
            "when using cuda graph."
        )

        # determine the cudagraph_capture_sizes
        if self.compilation_config.cudagraph_capture_sizes is not None:
            assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
                "cudagraph_capture_sizes should contain at least one element "
                "when using cuda graph."
            )
            # de-duplicate the sizes provided by the config
            dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
            # drop user sizes that exceed max_num_batched_tokens
            cudagraph_capture_sizes = [
                i for i in dedup_sizes if i <= max_num_tokens
            ]
            # sort to make sure the sizes are in ascending order
            cudagraph_capture_sizes.sort()
        else:
            if self.performance_mode == "interactivity":
                # Fine-grained CUDA graphs at small batch sizes
                # for minimal padding overhead
                interactivity_max = min(max_cudagraph_capture_size, 32)
                cudagraph_capture_sizes = list(range(1, interactivity_max + 1))
            else:
                cudagraph_capture_sizes = [
                    i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
                ]
            if max_cudagraph_capture_size >= 8:
                # Step size 8 for small batch sizes, up to 256(not included)
                cudagraph_capture_sizes += list(
                    range(8, min(max_cudagraph_capture_size + 1, 256), 8)
                )
            if max_cudagraph_capture_size >= 256:
                # Step size 16 for larger batch sizes
                cudagraph_capture_sizes += list(
                    range(256, max_cudagraph_capture_size + 1, 16)
                )
            # ensure max_num_tokens is captured if within max capture size
            if (
                max_num_tokens <= max_cudagraph_capture_size
                and max_num_tokens not in cudagraph_capture_sizes
            ):
                cudagraph_capture_sizes.append(max_num_tokens)
            # de-duplicate and sort the sizes
            cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))

        # with TP > 1 and sequence parallelism enabled, the size list is
        # passed through update_sizes_for_sequence_parallelism
        if (
            self.parallel_config.tensor_parallel_size > 1
            and self.compilation_config.pass_config.enable_sp
        ):
            cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                cudagraph_capture_sizes
            )

        # user-specific compilation_config.max_cudagraph_capture_size get
        # truncated to valid_max_size when they are inconsistent.
        # valid_max_size is the largest remaining size, or 0 if none remain.
        valid_max_size = (
            cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
        )
        if (
            self.compilation_config.max_cudagraph_capture_size is not None
            and self.compilation_config.max_cudagraph_capture_size != valid_max_size
        ):
            # raise error only when both two flags are user-specified
            # and they are inconsistent with each other
            if self.compilation_config.cudagraph_capture_sizes is not None:
                raise ValueError(
                    "customized max_cudagraph_capture_size"
                    f"(={self.compilation_config.max_cudagraph_capture_size}) "
                    "should be consistent with the max value of "
                    f"cudagraph_capture_sizes(={valid_max_size})"
                )

            logger.warning(
                "Truncating max_cudagraph_capture_size to %d",
                valid_max_size,
            )
        # always set the final max_cudagraph_capture_size
        self.compilation_config.max_cudagraph_capture_size = valid_max_size

        # the config still holds the user's original list at this point,
        # so the length comparison detects sizes that were dropped
        if self.compilation_config.cudagraph_capture_sizes is not None and len(
            cudagraph_capture_sizes
        ) < len(self.compilation_config.cudagraph_capture_sizes):
            # If users have specified capture sizes, we only need to
            # compare the lens before and after modification since the modified
            # list is only the subset of the original list.
            logger.warning(
                (
                    "cudagraph_capture_sizes specified in compilation_config"
                    " %s is overridden by config %s"
                ),
                self.compilation_config.cudagraph_capture_sizes,
                cudagraph_capture_sizes,
            )
        # always write back the final sizes
        self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes

    else:
        # no cudagraph in use
        self.compilation_config.max_cudagraph_capture_size = 0
        self.compilation_config.cudagraph_capture_sizes = []

    # complete the remaining process.
    self.compilation_config.post_init_cudagraph_sizes()

_set_max_num_scheduled_tokens()

In most cases, the scheduler may schedule a batch with as many tokens as the worker is configured to handle. However for some speculative decoding methods, the drafter model may insert additional slots into the batch when drafting. To account for this, we need to decrease the max_num_scheduled_tokens by an upper bound on the number of slots that can be added.

Source code in vllm/config/vllm.py
def _set_max_num_scheduled_tokens(self):
    """
    In most cases, the scheduler may schedule a batch with as many tokens as the
    worker is configured to handle. However for some speculative decoding methods,
    the drafter model may insert additional slots into the batch when drafting.
    To account for this, we need to decrease the max_num_scheduled_tokens by an
    upper bound on the number of slots that can be added.
    """
    if self.speculative_config is not None:
        scheduled_token_delta = (
            self.speculative_config.max_num_new_slots_for_drafting
            * self.scheduler_config.max_num_seqs
        )
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        if self.scheduler_config.max_num_scheduled_tokens is None:
            self.scheduler_config.max_num_scheduled_tokens = (
                max_num_batched_tokens - scheduled_token_delta
            )

        if self.scheduler_config.max_num_scheduled_tokens <= 0:
            raise ValueError(
                "max_num_scheduled_tokens is set to"
                f" {self.scheduler_config.max_num_scheduled_tokens} based on"
                " the speculative decoding settings, which does not allow"
                " any tokens to be scheduled. Increase max_num_batched_tokens"
                " to accommodate the additional draft token slots, or decrease"
                " num_speculative_tokens or max_num_seqs."
            )
        if self.scheduler_config.max_num_scheduled_tokens < 8192:
            logger.warning_once(
                "max_num_scheduled_tokens is set to"
                f" {self.scheduler_config.max_num_scheduled_tokens} based on"
                " the speculative decoding settings. This may lead to suboptimal"
                " performance. Consider increasing max_num_batched_tokens to"
                " accommodate the additional draft token slots, or decrease"
                " num_speculative_tokens or max_num_seqs.",
            )

        max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens
        if max_num_batched_tokens < max_num_scheduled_tokens + (
            self.speculative_config.max_num_new_slots_for_drafting
            * self.scheduler_config.max_num_seqs
        ):
            raise ValueError(
                f"VllmConfig received max_num_scheduled_tokens but it does not have"
                " enough slots to support the speculative decoding settings."
                f" It should be greater by at least {scheduled_token_delta}, but"
                f" got {max_num_batched_tokens=} and {max_num_scheduled_tokens=}."
            )

_validate_v2_model_runner()

Check for features not yet supported by the V2 model runner.

Source code in vllm/config/vllm.py
def _validate_v2_model_runner(self) -> None:
    """Fail fast when the configuration cannot run on the V2 model runner.

    Raises:
        ValueError: If Triton is unavailable, or if any configured feature is
            reported as unsupported by
            ``_get_v2_model_runner_unsupported_features``.
    """
    if not HAS_TRITON:
        raise ValueError("Model Runner V2 requires Triton.")

    missing = self._get_v2_model_runner_unsupported_features()
    if missing:
        feature_list = ", ".join(missing)
        raise ValueError(f"Model Runner V2 does not yet support: {feature_list}")

    if self.reasoning_config is None:
        return

    # A reasoning config only triggers a warning, not an error.
    logger.warning_once(
        "Model Runner V2 does not yet support the thinking_token_budget "
        "request parameter. Set VLLM_USE_V2_MODEL_RUNNER=0 if this is required."
    )

_verify_kv_transfer_compat()

Reject configurations that silently corrupt KV transfers.

Source code in vllm/config/vllm.py
def _verify_kv_transfer_compat(self) -> None:
    """Reject configurations that silently corrupt KV transfers."""
    if (
        self.kv_transfer_config is None
        or self.kv_transfer_config.kv_connector is None
    ):
        return

    # PyTorch's expandable_segments allocator uses CUDA VMM, which can
    # remap a virtual address range to different physical pages over the
    # engine's lifetime. KV connectors that pin KV cache memory (e.g.
    # NixlConnector via ibv_reg_mr, MooncakeConnector) end up with their
    # registrations pointing at stale physical pages after any remap,
    # producing RDMA failures like IBV_WC_REM_ACCESS_ERR /
    # NIXL_ERR_REMOTE_DISCONNECT at the first inter-node KV transfer.
    # We can't enumerate every in-tree and out-of-tree connector that
    # pins memory, so we conservatively reject the combination whenever
    # any KV connector is configured.
    #
    # CuMem allocator is exempt: CuMemAllocator.use_memory_pool toggles
    # expandable_segments off around its pool (see #40812), so the KV
    # cache allocated within that context lands on stable physical pages
    # even when the env var is set.
    if "expandable_segments:True" not in os.environ.get(
        "PYTORCH_CUDA_ALLOC_CONF", ""
    ):
        return
    if self.model_config is not None and (self.model_config.enable_cumem_allocator):
        return

    raise ValueError(
        f"KV connector {self.kv_transfer_config.kv_connector} is "
        "incompatible with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True "
        "unless enable_cumem_allocator is also enabled. PyTorch's CUDA VMM "
        "allocator can remap KV cache virtual addresses to different "
        "physical pages, invalidating any pinned/registered KV memory "
        "(e.g. IB memory regions registered by NIXL or Mooncake). Either "
        "unset expandable_segments:True or enable the cumem allocator "
        "(sleep mode does this automatically and also "
        "routes KV allocations through CuMemAllocator's pool, where "
        "expandable_segments is automatically disabled)."
    )

compile_debug_dump_path()

Returns a rank-aware path for dumping torch.compile debug information.

Source code in vllm/config/vllm.py
def compile_debug_dump_path(self) -> Path | None:
    """Returns a rank-aware path for dumping
    torch.compile debug information.
    """
    if self.compilation_config.debug_dump_path is None:
        return None
    tp_rank = self.parallel_config.rank
    dp_rank = self.parallel_config.data_parallel_index
    append_path = f"rank_{tp_rank}_dp_{dp_rank}"
    path = self.compilation_config.debug_dump_path / append_path
    return path

compute_hash()

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/vllm.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Provide a hash that uniquely identifies all the configs
    that affect the structure of the computation
    graph from input ids/embeddings to the final hidden states,
    excluding anything before input ids/embeddings and after
    the final hidden states.

    Returns:
        The first 10 hex characters of the hash over the vLLM version and
        the per-sub-config hashes.
    """
    from vllm import __version__

    def digest_or_placeholder(sub_config: Any) -> Any:
        """Return the sub-config's hash, or the string "None" if it is falsy."""
        return sub_config.compute_hash() if sub_config else "None"

    # summarize vllm config
    summary: list[Any] = [__version__]

    model_config = self.model_config
    if model_config:
        summary.append(model_config.compute_hash())
        compilation = self.compilation_config
        # The multimodal config only contributes when the encoder is compiled.
        if (
            compilation
            and getattr(compilation, "compile_mm_encoder", False)
            and model_config.multimodal_config
        ):
            summary.append(model_config.multimodal_config.compute_hash())
    else:
        summary.append("None")

    for field_name in (
        "cache_config",
        "parallel_config",
        "scheduler_config",
        "device_config",
        "load_config",
        "offload_config",
        "attention_config",
        "lora_config",
        "speculative_config",
    ):
        summary.append(digest_or_placeholder(getattr(self, field_name)))

    # NOTE(review): unlike its neighbours, a falsy structured_outputs_config
    # adds no placeholder, and a falsy kernel_config adds the object None
    # rather than the string "None". Both are kept as-is because changing
    # them would change every computed hash.
    if self.structured_outputs_config:
        summary.append(self.structured_outputs_config.compute_hash())
    summary.append(digest_or_placeholder(self.profiler_config))
    summary.append(self.observability_config.compute_hash())
    # quant_config is not hashed here: it should be captured by
    # model_config.quantization.
    summary.append(digest_or_placeholder(self.compilation_config))
    if self.kernel_config:
        summary.append(self.kernel_config.compute_hash())
    else:
        summary.append(None)
    summary.append(digest_or_placeholder(self.kv_transfer_config))
    summary.append(digest_or_placeholder(self.ec_transfer_config))

    # additional_config may be a plain dict (hashed via sorted JSON) or an
    # object that provides its own compute_hash().
    extra = self.additional_config
    if not extra:
        summary.append("None")
    elif isinstance(extra, dict):
        summary.append(
            safe_hash(
                json.dumps(extra, sort_keys=True).encode(),
                usedforsecurity=False,
            ).hexdigest()
        )
    else:
        summary.append(extra.compute_hash())

    factors: list[Any] = [summary]
    full_digest = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
    return full_digest[:10]

enable_trace_function_call_for_thread()

Set up function tracing for the current thread, if enabled via the VLLM_TRACE_FUNCTION environment variable.

Source code in vllm/config/vllm.py
def enable_trace_function_call_for_thread(self) -> None:
    """
    Turn on function-call tracing for the calling thread.

    Does nothing unless the `VLLM_TRACE_FUNCTION` environment variable
    is enabled. When it is, the log file is placed under
    `<tempdir>/<user>/vllm/vllm-instance-<instance_id>/`, and its name
    embeds the process id, the thread id and the current timestamp.
    """
    if not envs.VLLM_TRACE_FUNCTION:
        return

    # Namespace the temp directory by username to avoid permission issues.
    user_tmp_dir = os.path.join(tempfile.gettempdir(), getpass.getuser())

    raw_name = (
        f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
        f"_thread_{threading.get_ident()}_at_{datetime.now()}.log"
    )
    # str(datetime.now()) contains a space; keep the file name space-free.
    filename = raw_name.replace(" ", "_")

    instance_dir_name = f"vllm-instance-{self.instance_id}"
    log_path = os.path.join(user_tmp_dir, "vllm", instance_dir_name, filename)

    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    enable_trace_function_call(log_path)

validate_block_size()

Validate block_size against DCP and mamba constraints.

Called after Platform.update_block_size_for_backend() has finalised block_size.

Source code in vllm/config/vllm.py
def validate_block_size(self) -> None:
    """Validate block_size against DCP and mamba constraints.

    Called after Platform.update_block_size_for_backend() has
    finalised block_size.

    Side effects:
        When decode context parallelism is enabled and
        ``dcp_kv_cache_interleave_size`` is greater than 1 and differs from
        ``cp_kv_cache_interleave_size``, the latter is overwritten with the
        former and a one-time warning is logged.

    Raises:
        AssertionError: If ``block_size`` violates the DCP interleave-size
            constraint or any of the Mamba "align" cache-mode constraints.
    """
    block_size = self.cache_config.block_size
    parallel_config = self.parallel_config
    scheduler_config = self.scheduler_config

    # DCP interleave-size compatibility
    if parallel_config.decode_context_parallel_size > 1:
        if parallel_config.dcp_kv_cache_interleave_size > 1 and (
            parallel_config.cp_kv_cache_interleave_size
            != parallel_config.dcp_kv_cache_interleave_size
        ):
            # The DCP-specific setting takes precedence over the generic one.
            parallel_config.cp_kv_cache_interleave_size = (
                parallel_config.dcp_kv_cache_interleave_size
            )
            logger.warning_once(
                "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
                "_interleave_size. And dcp-kv-cache-interleave-size will be "
                "deprecated when PCP is fully supported."
            )
        assert (
            parallel_config.cp_kv_cache_interleave_size <= block_size
            and block_size % parallel_config.cp_kv_cache_interleave_size == 0
        ), (
            f"Block_size({block_size}) should be greater "
            "than or equal to and divisible by cp_kv_cache_interleave_size "
            f"({parallel_config.cp_kv_cache_interleave_size})."
        )

    # Mamba cache align-mode constraints
    if self.cache_config.mamba_cache_mode == "align":
        assert block_size <= scheduler_config.max_num_batched_tokens, (
            "In Mamba cache align mode, block_size "
            f"({block_size}) must be <= "
            "max_num_batched_tokens "
            f"({scheduler_config.max_num_batched_tokens})."
        )
        # A threshold of 0 (or less) is treated as "not set" and is skipped.
        if scheduler_config.long_prefill_token_threshold > 0:
            assert scheduler_config.long_prefill_token_threshold >= block_size, (
                "In Mamba cache align mode, long_prefill_token_threshold "
                f"({scheduler_config.long_prefill_token_threshold}) must be >= "
                f"block_size ({block_size})."
            )
        assert not scheduler_config.disable_chunked_mm_input, (
            "Chunked MM input is required because we need the flexibility "
            "to schedule a multiple of block_size tokens even if they are "
            "in the middle of a mm input"
        )

WeightTransferConfig

Configuration for weight transfer during RL training.

Attributes:

  • backend (Literal['nccl', 'ipc'] | str) –

    The backend to use for weight transfer. Validated against the WeightTransferEngineFactory registry at engine creation time.

Source code in vllm/config/weight_transfer.py
@config
class WeightTransferConfig:
    """Configuration for weight transfer during RL training."""

    # Name of the weight-transfer backend. "nccl" and "ipc" are the values
    # spelled out in the annotation, but any other string is also admitted
    # by the type; the default is "nccl".
    backend: Literal["nccl", "ipc"] | str = "nccl"
    """The backend to use for weight transfer. Validated against the
    `WeightTransferEngineFactory` registry at engine creation time.
    """

backend = 'nccl' class-attribute instance-attribute

The backend to use for weight transfer. Validated against the WeightTransferEngineFactory registry at engine creation time.

config(cls=None, *, config=None, **kwargs)

config(cls: type[ConfigT]) -> type[ConfigT]
config(
    *, config: ConfigDict | None = None, **kwargs: Any
) -> Callable[[type[ConfigT]], type[ConfigT]]

Decorator to create a pydantic dataclass with default config. The default config for the dataclass forbids extra fields.

All config classes in vLLM should use this decorator.

Parameters:

  • cls

    (type[ConfigT] | None, default: None ) –

    The class to decorate

  • config

    (ConfigDict | None, default: None ) –

    The pydantic ConfigDict to use. If provided, it will be merged with the default config.

  • **kwargs

    (Any, default: {} ) –

    Additional arguments to pass to pydantic.dataclass.

Source code in vllm/config/utils.py
@dataclass_transform(field_specifiers=(PydanticField,))
def config(
    cls: type[ConfigT] | None = None,
    *,
    config: ConfigDict | None = None,
    **kwargs: Any,
) -> type[ConfigT] | Callable[[type[ConfigT]], type[ConfigT]]:
    """Turn a class into a pydantic dataclass using vLLM's default config.

    By default the resulting dataclass forbids extra fields. Every config
    class in vLLM is expected to be declared through this decorator, either
    bare (`@config`) or parameterised (`@config(config=...)`).

    Args:
        cls: The class to decorate. `None` when the decorator is used with
            arguments.
        config: Optional pydantic ConfigDict whose entries are merged on top
            of the default config.
        **kwargs: Extra keyword arguments forwarded to pydantic.dataclass.

    Returns:
        The decorated class when `cls` is given, otherwise a decorator that
        accepts the class."""
    # Defaults first (extra fields are rejected), then caller overrides.
    effective_config = ConfigDict(extra="forbid")
    if config is not None:
        effective_config.update(config)

    def wrap(target: type[ConfigT]) -> type[ConfigT]:
        return dataclass(target, config=effective_config, **kwargs)  # type: ignore[return-value]

    # `@config(config=...)` -> hand back the decorator;
    # bare `@config`        -> decorate immediately.
    return wrap if cls is None else wrap(cls)

get_attr_docs(cls)

Get any docstrings placed after attribute assignments in a class body.

https://davidism.com/mit-license/

Source code in vllm/config/utils.py
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
    """
    Get any docstrings placed after attribute assignments in a class body.

    https://davidism.com/mit-license/
    """

    cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]

    if not isinstance(cls_node, ast.ClassDef):
        raise TypeError("Given object was not a class.")

    out = {}

    # Consider each pair of nodes.
    for a, b in pairwise(cls_node.body):
        # Must be an assignment then a constant string.
        if (
            not isinstance(a, (ast.Assign, ast.AnnAssign))
            or not isinstance(b, ast.Expr)
            or not isinstance(b.value, ast.Constant)
            or not isinstance(b.value.value, str)
        ):
            continue

        doc = inspect.cleandoc(b.value.value)

        # An assignment can have multiple targets (a = b = v), but an
        # annotated assignment only has one target.
        targets = a.targets if isinstance(a, ast.Assign) else [a.target]

        for target in targets:
            # Must be assigning to a plain name.
            if not isinstance(target, ast.Name):
                continue

            out[target.id] = doc

    return out

get_cached_compilation_config() cached

Cache config to avoid repeated calls to get_current_vllm_config()

Source code in vllm/config/vllm.py
@lru_cache(maxsize=1)
def get_cached_compilation_config():
    """Return the current vLLM config's compilation config, memoised.

    The single-entry cache avoids calling get_current_vllm_config() again
    on every access.
    """
    current_config = get_current_vllm_config()
    return current_config.compilation_config

get_layers_from_vllm_config(vllm_config, layer_type, layer_names=None)

Get layers from the vLLM config.

Parameters:

  • vllm_config

    (VllmConfig) –

    The vLLM config.

  • layer_type

    (type[T]) –

    The type of the layer to get.

  • layer_names

    (Iterable[str] | None, default: None ) –

    The names of the layers to get. If None, return all layers.

Source code in vllm/config/vllm.py
def get_layers_from_vllm_config(
    vllm_config: VllmConfig,
    layer_type: type[T],
    layer_names: Iterable[str] | None = None,
) -> dict[str, T]:
    """
    Look up layers of a given type in the vLLM config.

    Layers are read from
    `vllm_config.compilation_config.static_forward_context`. Names that are
    absent from that mapping, or whose entry is not an instance of
    `layer_type`, are left out of the result.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.

    Returns:
        A dict from layer name to layer, in the order the names were visited.
    """
    registry = vllm_config.compilation_config.static_forward_context
    names = registry.keys() if layer_names is None else layer_names

    matched = {}
    for name in names:
        candidate = registry.get(name)
        if isinstance(candidate, layer_type):
            matched[name] = candidate
    return matched

replace(dataclass_instance, /, **kwargs)

Like dataclasses.replace, but compatible with Pydantic dataclasses, which use pydantic.fields.Field instead of dataclasses.field.

Source code in vllm/config/utils.py
def replace(dataclass_instance: ConfigT, /, **kwargs) -> ConfigT:
    """Build a new instance of the same class with some fields overridden.

    Works like [`dataclasses.replace`](https://docs.python.org/3/library/dataclasses.html#dataclasses.replace),
    but is compatible with Pydantic dataclasses, which use
    `pydantic.fields.Field` instead of `dataclasses.field`.

    Only entries of the instance's `__dict__` that `is_init_field` accepts
    are carried over; `kwargs` then override or extend them before the class
    is called again."""
    cls = type(dataclass_instance)

    init_kwargs = {}
    for name, value in dataclass_instance.__dict__.items():
        if is_init_field(cls, name):
            init_kwargs[name] = value

    # Caller-supplied values win over the ones copied from the instance.
    init_kwargs.update(kwargs)
    return cls(**init_kwargs)

set_current_vllm_config(vllm_config, check_compile=False, prefix=None)

Temporarily set the current vLLM config. Used during model initialization. We save the current vLLM config in a global variable, so that all modules can access it, e.g. custom ops can access the vLLM config to determine how to dispatch.

Source code in vllm/config/vllm.py
@contextmanager
def set_current_vllm_config(
    vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
):
    """
    Context manager that installs `vllm_config` (and `prefix`) as the
    current vLLM config for the duration of the `with` block.

    Used during model initialization. The config is kept in a module-level
    global so that any module can reach it, e.g. custom ops that need the
    vLLM config to decide how to dispatch. The previous config and prefix
    are restored on exit, whether or not the body raised.

    When `check_compile` is true and the body finished without an exception,
    the custom-op log check is run, and a warning is logged if compilation
    mode is VLLM_COMPILE but no new model was counted as compiled.
    """
    global _current_vllm_config, _current_prefix
    from vllm.compilation.counter import compilation_counter

    # Remember what was active so it can be reinstated on exit.
    saved_config, saved_prefix = _current_vllm_config, _current_prefix
    models_seen_on_entry = compilation_counter.num_models_seen
    try:
        # The cached compilation config may belong to the previous context
        # (it could have been read and cached before this config is set),
        # so drop it before switching.
        get_cached_compilation_config.cache_clear()

        _current_vllm_config, _current_prefix = vllm_config, prefix
        yield
    except Exception:
        # Propagate unchanged; this clause exists so the success-only
        # `else` branch below can be attached to the `try`.
        raise
    else:
        if check_compile:
            vllm_config.compilation_config.custom_op_log_check()

            # A model that supports compilation bumps
            # compilation_counter.num_models_seen by at least 1. An unchanged
            # counter means the model does not support compilation (it lacks
            # the @support_torch_compile decorator).
            if (
                vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
                and compilation_counter.num_models_seen == models_seen_on_entry
            ):
                logger.warning(
                    "`torch.compile` is turned on, but the model %s"
                    " does not support it. Please open an issue on GitHub"
                    " if you want it to be supported.",
                    vllm_config.model_config.model,
                )
    finally:
        _current_vllm_config, _current_prefix = saved_config, saved_prefix
        # The context changed again, so the cached compilation config is stale.
        get_cached_compilation_config.cache_clear()