Skip to content

vllm.model_executor.models.config

Classes:

ColQwen3_5Config

Bases: Qwen3_5ForConditionalGenerationConfig

ColQwen3.5 (late-interaction retrieval) inherits Qwen3.5's mamba cache handling and additionally uses bidirectional attention: ColPali-style document/query encoding attends over the whole sequence, not causally. It sets is_causal=False so that Qwen3NextAttention builds its full_attention layers with AttentionType.ENCODER_ONLY (the linear_attention GatedDeltaNet layers are unaffected). Generation architectures keep the parent (causal) config and are untouched.

Source code in vllm/model_executor/models/config.py
class ColQwen3_5Config(Qwen3_5ForConditionalGenerationConfig):
    """Config hook for ColQwen3.5, a late-interaction retrieval model.

    Qwen3.5's mamba cache handling is inherited unchanged from the parent
    class. The one addition is bidirectional attention: ColPali-style
    document/query encoding attends over the whole sequence rather than
    causally, so ``is_causal`` is set to False on the HF config. That lets
    Qwen3NextAttention build its full_attention layers with
    AttentionType.ENCODER_ONLY, while the linear_attention GatedDeltaNet
    layers are unaffected. Generation architectures keep using the parent
    (causal) config and are untouched.
    """

    @staticmethod
    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        """Mark the model's HF config as non-causal, in place."""
        hf_config = model_config.hf_config
        hf_config.is_causal = False

DiffusionGemmaModelForBlockDiffusionConfig

Bases: VerifyAndUpdateConfig

Methods:

Source code in vllm/model_executor/models/config.py
class DiffusionGemmaModelForBlockDiffusionConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Apply DiffusionGemma-specific defaults to ``vllm_config`` in place.

        In order, this:

        1. runs Gemma4's attention backend selection;
        2. rejects an explicit FLASHINFER backend and, when no backend was
           chosen, sets ``use_non_causal`` so FlashInfer is excluded from
           auto-selection;
        3. auto-creates DiffusionConfig from the HF config when the user
           didn't pass ``--diffusion-config``;
        4. lowers ``max_num_seqs`` to 8 when it is still at or above the
           generic scheduler default;
        5. registers ``max_new_tokens=None`` in the generation-config
           overrides unless the user already supplied that key.

        Diffusion sampling params are read straight from
        generation_config.json at sampler-build time (see DiffusionGemma's
        custom_sampler), not injected here.

        Args:
            vllm_config: vLLM Config

        Raises:
            ValueError: If the FLASHINFER attention backend was selected.
        """
        # Start from Gemma4's attention backend selection (FA4 on Hopper,
        # TRITON_ATTN fallback for heterogeneous head dims).
        Gemma4Config.verify_and_update_config(vllm_config)

        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        attn_cfg = vllm_config.attention_config
        if attn_cfg.backend == AttentionBackendEnum.FLASHINFER:
            raise ValueError(
                "FlashInfer does not support DiffusionGemma's mixed "
                "causal/bidirectional attention. Use --attention-backend "
                "FLASH_ATTN or TRITON_ATTN instead."
            )
        backend_unset = attn_cfg.backend is None
        if backend_unset and not attn_cfg.use_non_causal:
            attn_cfg.use_non_causal = True
            logger.info(
                "DiffusionGemma uses mixed causal/bidirectional attention "
                "within a batch; setting use_non_causal=True to exclude "
                "FlashInfer from auto-selection."
            )

        # No DiffusionConfig supplied: derive one from the HF config,
        # falling back to a canvas length of 256.
        if vllm_config.diffusion_config is None:
            from vllm.config.diffusion import DiffusionConfig

            vllm_config.diffusion_config = DiffusionConfig(
                canvas_length=getattr(
                    vllm_config.model_config.hf_config, "canvas_length", 256
                ),
            )

        # The diffusion sampler materializes [num_seqs, canvas_length, vocab]
        # fp32 transients, so concurrency is memory-bound (>8 OOMs a single
        # H200). The user's original "unset" value is no longer visible here
        # (the engine already filled in a generic default), so a value at or
        # above DEFAULT_MAX_NUM_SEQS is used as a proxy for "not passed" and
        # replaced by 8.
        from vllm.config.scheduler import SchedulerConfig

        scheduler_cfg = vllm_config.scheduler_config
        if scheduler_cfg is not None:
            seqs_at_default = (
                scheduler_cfg.max_num_seqs >= SchedulerConfig.DEFAULT_MAX_NUM_SEQS
            )
            if seqs_at_default:
                scheduler_cfg.max_num_seqs = 8

        # Drop the model's generation_config.json cap on max_new_tokens (256)
        # so each request controls its own output length via max_tokens.
        # A None value makes get_diff_sampling_param skip the key entirely.
        # A user-provided override always wins.
        gen_overrides = vllm_config.model_config.override_generation_config
        if "max_new_tokens" in gen_overrides:
            return
        gen_overrides["max_new_tokens"] = None
        logger.info(
            "DiffusionGemma: removing server-wide max_new_tokens cap "
            "from generation_config.json (use "
            "--override-generation-config to set a custom limit).",
        )

verify_and_update_config(vllm_config) classmethod

Set up the diffusion config and defaults for DiffusionGemma.

Auto-creates DiffusionConfig from the HF config when the user didn't pass --diffusion-config. Diffusion sampling params are read straight from generation_config.json at sampler-build time (see DiffusionGemma's custom_sampler), not injected here.

Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """Apply DiffusionGemma-specific defaults to ``vllm_config`` in place.

    In order, this:

    1. runs Gemma4's attention backend selection;
    2. rejects an explicit FLASHINFER backend and, when no backend was
       chosen, sets ``use_non_causal`` so FlashInfer is excluded from
       auto-selection;
    3. auto-creates DiffusionConfig from the HF config when the user
       didn't pass ``--diffusion-config``;
    4. lowers ``max_num_seqs`` to 8 when it is still at or above the
       generic scheduler default;
    5. registers ``max_new_tokens=None`` in the generation-config
       overrides unless the user already supplied that key.

    Diffusion sampling params are read straight from
    generation_config.json at sampler-build time (see DiffusionGemma's
    custom_sampler), not injected here.

    Args:
        vllm_config: vLLM Config

    Raises:
        ValueError: If the FLASHINFER attention backend was selected.
    """
    # Start from Gemma4's attention backend selection (FA4 on Hopper,
    # TRITON_ATTN fallback for heterogeneous head dims).
    Gemma4Config.verify_and_update_config(vllm_config)

    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    attn_cfg = vllm_config.attention_config
    if attn_cfg.backend == AttentionBackendEnum.FLASHINFER:
        raise ValueError(
            "FlashInfer does not support DiffusionGemma's mixed "
            "causal/bidirectional attention. Use --attention-backend "
            "FLASH_ATTN or TRITON_ATTN instead."
        )
    backend_unset = attn_cfg.backend is None
    if backend_unset and not attn_cfg.use_non_causal:
        attn_cfg.use_non_causal = True
        logger.info(
            "DiffusionGemma uses mixed causal/bidirectional attention "
            "within a batch; setting use_non_causal=True to exclude "
            "FlashInfer from auto-selection."
        )

    # No DiffusionConfig supplied: derive one from the HF config,
    # falling back to a canvas length of 256.
    if vllm_config.diffusion_config is None:
        from vllm.config.diffusion import DiffusionConfig

        vllm_config.diffusion_config = DiffusionConfig(
            canvas_length=getattr(
                vllm_config.model_config.hf_config, "canvas_length", 256
            ),
        )

    # The diffusion sampler materializes [num_seqs, canvas_length, vocab]
    # fp32 transients, so concurrency is memory-bound (>8 OOMs a single
    # H200). The user's original "unset" value is no longer visible here
    # (the engine already filled in a generic default), so a value at or
    # above DEFAULT_MAX_NUM_SEQS is used as a proxy for "not passed" and
    # replaced by 8.
    from vllm.config.scheduler import SchedulerConfig

    scheduler_cfg = vllm_config.scheduler_config
    if scheduler_cfg is not None:
        seqs_at_default = (
            scheduler_cfg.max_num_seqs >= SchedulerConfig.DEFAULT_MAX_NUM_SEQS
        )
        if seqs_at_default:
            scheduler_cfg.max_num_seqs = 8

    # Drop the model's generation_config.json cap on max_new_tokens (256)
    # so each request controls its own output length via max_tokens.
    # A None value makes get_diff_sampling_param skip the key entirely.
    # A user-provided override always wins.
    gen_overrides = vllm_config.model_config.override_generation_config
    if "max_new_tokens" in gen_overrides:
        return
    gen_overrides["max_new_tokens"] = None
    logger.info(
        "DiffusionGemma: removing server-wide max_new_tokens cap "
        "from generation_config.json (use "
        "--override-generation-config to set a custom limit).",
    )

Gemma4Config

Bases: VerifyAndUpdateConfig

Methods:

Source code in vllm/model_executor/models/config.py
class Gemma4Config(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Pick an attention setup that copes with mixed head dimensions.

        Gemma4 uses different head dimensions for sliding window
        (head_dim) vs full attention (global_head_dim) layers. The
        default FA3 on Hopper cannot handle head_dim > 256, which
        causes mixed backend selection and numerical divergence.

        Nothing is changed when either dimension is missing from the HF
        text config or when both are equal. Otherwise:

        * FA4 supported and the larger dimension is <= 512: force
          ``flash_attn_version = 4`` for all layers, but only if the user
          left the FA version unset and the backend is unset or FLASH_ATTN.
        * Otherwise: fall back to TRITON_ATTN, but only if no backend was
          chosen.

        Args:
            vllm_config: vLLM Config
        """
        text_cfg = vllm_config.model_config.hf_text_config
        head_dim = getattr(text_cfg, "head_dim", None)
        global_head_dim = getattr(text_cfg, "global_head_dim", None)

        # Homogeneous (or unknown) head dims need no special handling.
        dims_known = head_dim is not None and global_head_dim is not None
        if not dims_known or head_dim == global_head_dim:
            return

        from vllm.v1.attention.backends.fa_utils import is_fa_version_supported
        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        largest_dim = max(head_dim, global_head_dim)
        attn_cfg = vllm_config.attention_config

        if is_fa_version_supported(4) and largest_dim <= 512:
            # Respect an explicit FA version or a non-FlashAttention backend.
            if attn_cfg.flash_attn_version is not None:
                return
            if attn_cfg.backend not in (None, AttentionBackendEnum.FLASH_ATTN):
                return
            attn_cfg.flash_attn_version = 4
            logger.info(
                "Gemma4 model has heterogeneous head dimensions "
                "(head_dim=%d, global_head_dim=%d). Using FA4 for "
                "all layers to avoid mixed FA3/FA4 penalty.",
                head_dim,
                global_head_dim,
            )
            return

        # FA4 path not usable: only override when the user chose no backend.
        if attn_cfg.backend is not None:
            return
        attn_cfg.backend = AttentionBackendEnum.TRITON_ATTN
        logger.info(
            "Gemma4 model has heterogeneous head dimensions "
            "(head_dim=%d, global_head_dim=%d). FA4 not available, "
            "forcing TRITON_ATTN backend.",
            head_dim,
            global_head_dim,
        )

verify_and_update_config(vllm_config) staticmethod

Configure attention for heterogeneous head dimensions.

Gemma4 uses different head dimensions for sliding window (head_dim) vs full attention (global_head_dim) layers. The default FA3 on Hopper cannot handle head_dim > 256, which causes mixed backend selection and numerical divergence.

When FA4 is available we force it for ALL layers, giving a uniform kernel path and avoiding the mixed FA3+FA4 penalty. When FA4 is not available we fall back to Triton.

Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Pick an attention setup that copes with mixed head dimensions.

    Gemma4 uses different head dimensions for sliding window
    (head_dim) vs full attention (global_head_dim) layers. The
    default FA3 on Hopper cannot handle head_dim > 256, which
    causes mixed backend selection and numerical divergence.

    Nothing is changed when either dimension is missing from the HF
    text config or when both are equal. Otherwise:

    * FA4 supported and the larger dimension is <= 512: force
      ``flash_attn_version = 4`` for all layers, but only if the user
      left the FA version unset and the backend is unset or FLASH_ATTN.
    * Otherwise: fall back to TRITON_ATTN, but only if no backend was
      chosen.

    Args:
        vllm_config: vLLM Config
    """
    text_cfg = vllm_config.model_config.hf_text_config
    head_dim = getattr(text_cfg, "head_dim", None)
    global_head_dim = getattr(text_cfg, "global_head_dim", None)

    # Homogeneous (or unknown) head dims need no special handling.
    dims_known = head_dim is not None and global_head_dim is not None
    if not dims_known or head_dim == global_head_dim:
        return

    from vllm.v1.attention.backends.fa_utils import is_fa_version_supported
    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    largest_dim = max(head_dim, global_head_dim)
    attn_cfg = vllm_config.attention_config

    if is_fa_version_supported(4) and largest_dim <= 512:
        # Respect an explicit FA version or a non-FlashAttention backend.
        if attn_cfg.flash_attn_version is not None:
            return
        if attn_cfg.backend not in (None, AttentionBackendEnum.FLASH_ATTN):
            return
        attn_cfg.flash_attn_version = 4
        logger.info(
            "Gemma4 model has heterogeneous head dimensions "
            "(head_dim=%d, global_head_dim=%d). Using FA4 for "
            "all layers to avoid mixed FA3/FA4 penalty.",
            head_dim,
            global_head_dim,
        )
        return

    # FA4 path not usable: only override when the user chose no backend.
    if attn_cfg.backend is not None:
        return
    attn_cfg.backend = AttentionBackendEnum.TRITON_ATTN
    logger.info(
        "Gemma4 model has heterogeneous head dimensions "
        "(head_dim=%d, global_head_dim=%d). FA4 not available, "
        "forcing TRITON_ATTN backend.",
        head_dim,
        global_head_dim,
    )

HybridAttentionMambaModelConfig

Bases: VerifyAndUpdateConfig

Methods:

Source code in vllm/model_executor/models/config.py
class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Early validation and setup for hybrid attention/mamba models.

        Turns off ``calculate_kv_scales`` (with a warning) if it was
        requested, then delegates to
        ``MambaModelConfig.verify_and_update_config`` for the shared mamba
        settings.

        Block size alignment with mamba page sizes is not done here; it is
        handled later by Platform.update_block_size_for_backend(), which
        runs after model layers are constructed and the attention backend
        is known.

        Args:
            vllm_config: vLLM Config
        """
        kv_cache_cfg = vllm_config.cache_config

        # Hybrid models cannot calibrate KV scales: the recurrent state is
        # uninitialized during the calibration pass and corrupts the scales.
        # See issue: https://github.com/vllm-project/vllm/issues/37554
        if kv_cache_cfg.calculate_kv_scales:
            logger.warning(
                "Disabling calculate_kv_scales for hybrid model '%s'. "
                "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
                "produce unreliable KV cache scales during the "
                "calibration pass because recurrent state is "
                "uninitialized. Using default scale of 1.0 instead.",
                vllm_config.model_config.model,
            )
            kv_cache_cfg.calculate_kv_scales = False

        # Apply the common mamba configuration on top.
        MambaModelConfig.verify_and_update_config(vllm_config)

verify_and_update_config(vllm_config) classmethod

Perform early validation and setup for hybrid attention/mamba models.

Block size alignment with mamba page sizes is handled later by Platform.update_block_size_for_backend(), which runs after model layers are constructed and the attention backend is known.

Parameters:

- vllm_config (VllmConfig): vLLM Config

Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """
    Early validation and setup for hybrid attention/mamba models.

    Turns off ``calculate_kv_scales`` (with a warning) if it was
    requested, then delegates to
    ``MambaModelConfig.verify_and_update_config`` for the shared mamba
    settings.

    Block size alignment with mamba page sizes is not done here; it is
    handled later by Platform.update_block_size_for_backend(), which
    runs after model layers are constructed and the attention backend
    is known.

    Args:
        vllm_config: vLLM Config
    """
    kv_cache_cfg = vllm_config.cache_config

    # Hybrid models cannot calibrate KV scales: the recurrent state is
    # uninitialized during the calibration pass and corrupts the scales.
    # See issue: https://github.com/vllm-project/vllm/issues/37554
    if kv_cache_cfg.calculate_kv_scales:
        logger.warning(
            "Disabling calculate_kv_scales for hybrid model '%s'. "
            "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
            "produce unreliable KV cache scales during the "
            "calibration pass because recurrent state is "
            "uninitialized. Using default scale of 1.0 instead.",
            vllm_config.model_config.model,
        )
        kv_cache_cfg.calculate_kv_scales = False

    # Apply the common mamba configuration on top.
    MambaModelConfig.verify_and_update_config(vllm_config)

LlamaNemotronVLConfig

Bases: VerifyAndUpdateConfig

Config handler for LlamaNemotronVL embedding models.

Source code in vllm/model_executor/models/config.py
class LlamaNemotronVLConfig(VerifyAndUpdateConfig):
    """Config handler for LlamaNemotronVL embedding models."""

    @staticmethod
    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        """Prepare the HF and pooler configs for embedding use, in place.

        * Sets ``is_causal = False`` on the HF config and, if present, on
          its nested ``llm_config`` (bidirectional attention).
        * Copies ``vision_config.patch_size`` to the top-level HF config
          when a vision config exists.
        * Translates the HF ``pooling`` setting ("avg", "cls" or "last")
          into the pooler's sequence pooling type. The top-level value is
          preferred; when it is unset and ``llm_config`` exists, the value
          there is used, defaulting to "avg".

        Raises:
            ValueError: If the resolved pooling value is not one of the
                supported names.
        """
        from vllm.config.pooler import SequencePoolingType

        hf_config = model_config.hf_config

        # Bidirectional attention, on both the wrapper and the language model.
        hf_config.is_causal = False
        has_llm_config = hasattr(hf_config, "llm_config")
        if has_llm_config:
            hf_config.llm_config.is_causal = False

        if hasattr(hf_config, "vision_config"):
            hf_config.patch_size = hf_config.vision_config.patch_size

        # HF pooling name -> vLLM sequence pooling type.
        pooling_type_map: dict[str, SequencePoolingType] = {
            "avg": "MEAN",
            "cls": "CLS",
            "last": "LAST",
        }

        # Top-level setting first, then the one nested under llm_config.
        pooling = getattr(hf_config, "pooling", None)
        if pooling is None and has_llm_config:
            pooling = getattr(hf_config.llm_config, "pooling", "avg")

        if pooling not in pooling_type_map:
            raise ValueError(f"pool_type {pooling!r} not supported")

        model_config.pooler_config.seq_pooling_type = pooling_type_map[pooling]

MambaModelConfig

Bases: VerifyAndUpdateConfig

Methods:

Source code in vllm/model_executor/models/config.py
class MambaModelConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Reconcile the mamba cache mode and mamba block size with the prefix
        caching setting.

        NOTE: an earlier summary of this method said it enables
        FULL_AND_PIECEWISE cuda graph mode by default; no cuda graph
        setting is modified in this method body.

        With prefix caching enabled:
          - a cache mode of 'none' is replaced by 'all' if the model
            supports mamba prefix caching, otherwise by 'align';
          - 'all' is downgraded to 'align' for models without that support;
          - 'align' requires chunked prefill;
          - an unset mamba block size defaults to cache_config.block_size.

        With prefix caching disabled, the cache mode is forced to 'none'
        and an unset mamba block size defaults to max_model_len.

        Args:
            vllm_config: vLLM Config

        Raises:
            AssertionError: If the cache mode is 'align' and chunked
                prefill is disabled.
        """
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config

        if cache_config.enable_prefix_caching:
            if cache_config.mamba_cache_mode == "none":
                cache_config.mamba_cache_mode = (
                    "all" if model_config.supports_mamba_prefix_caching else "align"
                )
                logger.warning(
                    "Mamba cache mode is set to '%s' for %s by default "
                    "when prefix caching is enabled",
                    cache_config.mamba_cache_mode,
                    model_config.architecture,
                )
            if (
                cache_config.mamba_cache_mode == "all"
                and not model_config.supports_mamba_prefix_caching
            ):
                cache_config.mamba_cache_mode = "align"
                logger.warning(
                    "Hybrid or mamba-based model detected without support "
                    "for prefix caching with Mamba cache 'all' mode: "
                    "falling back to 'align' mode."
                )
            # Explicit raise rather than `assert` so the check is not
            # stripped under `python -O`; AssertionError is kept so callers
            # see the same exception type as before.
            if (
                cache_config.mamba_cache_mode == "align"
                and not vllm_config.scheduler_config.enable_chunked_prefill
            ):
                raise AssertionError(
                    "Chunked prefill is required for mamba cache mode 'align'."
                )
            logger.info(
                "Warning: Prefix caching in Mamba cache '%s' "
                "mode is currently enabled. "
                "Its support for Mamba layers is experimental. "
                "Please report any issues you may observe.",
                cache_config.mamba_cache_mode,
            )
            # By default, mamba block size will be set to max_model_len (see
            # below). When enabling prefix caching, we align mamba block size
            # to the block size as the basic granularity for prefix caching.
            if cache_config.mamba_block_size is None:
                cache_config.mamba_block_size = cache_config.block_size
        else:
            if cache_config.mamba_cache_mode != "none":
                cache_config.mamba_cache_mode = "none"
                logger.warning(
                    "Mamba cache mode is set to 'none' when prefix caching is disabled"
                )
            if cache_config.mamba_block_size is None:
                cache_config.mamba_block_size = model_config.max_model_len

verify_and_update_config(vllm_config) classmethod

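Reconcile the mamba cache mode and mamba block size with the prefix caching setting. With prefix caching enabled, a cache mode of 'none' defaults to 'all' when the model supports mamba prefix caching and to 'align' otherwise ('align' requires chunked prefill), and an unset mamba block size defaults to the cache block size. With prefix caching disabled, the cache mode is forced to 'none' and an unset mamba block size defaults to max_model_len. Note: the docstring's original summary reads "Enable FULL_AND_PIECEWISE cuda graph mode by default (required to get good performance for mamba layers in V1)", but no cuda graph setting is changed in the code shown below.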

Parameters:

- vllm_config (VllmConfig): vLLM Config

Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """
    Reconcile the mamba cache mode and mamba block size with the prefix
    caching setting.

    NOTE: an earlier summary of this method said it enables
    FULL_AND_PIECEWISE cuda graph mode by default; no cuda graph
    setting is modified in this method body.

    With prefix caching enabled:
      - a cache mode of 'none' is replaced by 'all' if the model
        supports mamba prefix caching, otherwise by 'align';
      - 'all' is downgraded to 'align' for models without that support;
      - 'align' requires chunked prefill;
      - an unset mamba block size defaults to cache_config.block_size.

    With prefix caching disabled, the cache mode is forced to 'none'
    and an unset mamba block size defaults to max_model_len.

    Args:
        vllm_config: vLLM Config

    Raises:
        AssertionError: If the cache mode is 'align' and chunked
            prefill is disabled.
    """
    model_config = vllm_config.model_config
    cache_config = vllm_config.cache_config

    if cache_config.enable_prefix_caching:
        if cache_config.mamba_cache_mode == "none":
            cache_config.mamba_cache_mode = (
                "all" if model_config.supports_mamba_prefix_caching else "align"
            )
            logger.warning(
                "Mamba cache mode is set to '%s' for %s by default "
                "when prefix caching is enabled",
                cache_config.mamba_cache_mode,
                model_config.architecture,
            )
        if (
            cache_config.mamba_cache_mode == "all"
            and not model_config.supports_mamba_prefix_caching
        ):
            cache_config.mamba_cache_mode = "align"
            logger.warning(
                "Hybrid or mamba-based model detected without support "
                "for prefix caching with Mamba cache 'all' mode: "
                "falling back to 'align' mode."
            )
        # Explicit raise rather than `assert` so the check is not
        # stripped under `python -O`; AssertionError is kept so callers
        # see the same exception type as before.
        if (
            cache_config.mamba_cache_mode == "align"
            and not vllm_config.scheduler_config.enable_chunked_prefill
        ):
            raise AssertionError(
                "Chunked prefill is required for mamba cache mode 'align'."
            )
        logger.info(
            "Warning: Prefix caching in Mamba cache '%s' "
            "mode is currently enabled. "
            "Its support for Mamba layers is experimental. "
            "Please report any issues you may observe.",
            cache_config.mamba_cache_mode,
        )
        # By default, mamba block size will be set to max_model_len (see
        # below). When enabling prefix caching, we align mamba block size
        # to the block size as the basic granularity for prefix caching.
        if cache_config.mamba_block_size is None:
            cache_config.mamba_block_size = cache_config.block_size
    else:
        if cache_config.mamba_cache_mode != "none":
            cache_config.mamba_cache_mode = "none"
            logger.warning(
                "Mamba cache mode is set to 'none' when prefix caching is disabled"
            )
        if cache_config.mamba_block_size is None:
            cache_config.mamba_block_size = model_config.max_model_len

NemotronHForCausalLMConfig

Bases: VerifyAndUpdateConfig

Methods:

Attributes:

Source code in vllm/model_executor/models/config.py
class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
    DEFAULT_MAMBA_SSM_CACHE_DTYPE = "float32"
    """Only `float32` is known to have no accuracy issues by default."""

    @classmethod
    def update_mamba_ssm_cache_dtype(
        cls, *, cache_config: "CacheConfig", hf_config: "PretrainedConfig"
    ) -> None:
        """Resolve an 'auto' mamba_ssm_cache_dtype for NemotronH models.

        When the cache config still says 'auto' (i.e. not explicitly set),
        it is replaced in place by the HF config's ``mamba_ssm_cache_dtype``
        or, if the HF config has no such attribute, by `float32`. Any other
        value is left untouched.
        """
        # An explicit user choice is never overridden.
        if cache_config.mamba_ssm_cache_dtype != "auto":
            return

        resolved_dtype = getattr(
            hf_config, "mamba_ssm_cache_dtype", cls.DEFAULT_MAMBA_SSM_CACHE_DTYPE
        )
        logger.info(
            "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
            resolved_dtype,
        )
        cache_config.mamba_ssm_cache_dtype = resolved_dtype

    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Resolve the mamba SSM cache dtype on ``vllm_config`` in place."""
        hf_config = vllm_config.model_config.hf_config
        cls.update_mamba_ssm_cache_dtype(
            cache_config=vllm_config.cache_config,
            hf_config=hf_config,
        )

DEFAULT_MAMBA_SSM_CACHE_DTYPE = 'float32' class-attribute instance-attribute

Only float32 is known to have no accuracy issues by default.

update_mamba_ssm_cache_dtype(*, cache_config, hf_config) classmethod

Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto' (or not explicitly set), to the value specified in the HF config, or to float32 if not specified.

Source code in vllm/model_executor/models/config.py
@classmethod
def update_mamba_ssm_cache_dtype(
    cls, *, cache_config: "CacheConfig", hf_config: "PretrainedConfig"
) -> None:
    """Resolve an 'auto' mamba_ssm_cache_dtype for NemotronH models.

    When the cache config still says 'auto' (i.e. not explicitly set),
    it is replaced in place by the HF config's ``mamba_ssm_cache_dtype``
    or, if the HF config has no such attribute, by `float32`. Any other
    value is left untouched.
    """
    # An explicit user choice is never overridden.
    if cache_config.mamba_ssm_cache_dtype != "auto":
        return

    resolved_dtype = getattr(
        hf_config, "mamba_ssm_cache_dtype", cls.DEFAULT_MAMBA_SSM_CACHE_DTYPE
    )
    logger.info(
        "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
        resolved_dtype,
    )
    cache_config.mamba_ssm_cache_dtype = resolved_dtype

Qwen3_5ForConditionalGenerationConfig

Bases: VerifyAndUpdateConfig

Methods:

Source code in vllm/model_executor/models/config.py
class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Align mamba_ssm_cache_dtype with the Qwen3.5 HF config.

        If the cache config is 'auto' (or not explicitly set), adopt the
        value of the HF text config's mamba_ssm_dtype field, when it has
        one. If the user explicitly passed a different value, keep the
        user's value and log a warning. Nothing happens when the HF config
        does not specify mamba_ssm_dtype.
        """
        cache_config = vllm_config.cache_config
        text_cfg = vllm_config.model_config.hf_text_config
        model_dtype = getattr(text_cfg, "mamba_ssm_dtype", None)
        requested_dtype = cache_config.mamba_ssm_cache_dtype

        # Without a model-side dtype there is nothing to apply or compare.
        if model_dtype is None:
            return

        if requested_dtype == "auto":
            cache_config.mamba_ssm_cache_dtype = model_dtype
            return

        if requested_dtype != model_dtype:
            logger.warning(
                "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
                "but --mamba-ssm-cache-dtype='%s' was passed. "
                "Using the user-specified value.",
                model_dtype,
                cache_config.mamba_ssm_cache_dtype,
            )

verify_and_update_config(vllm_config) staticmethod

Update mamba_ssm_cache_dtype for Qwen3.5 models when set to 'auto' (or not explicitly set), to the value specified in the HF config's mamba_ssm_dtype field. Warn if the user explicitly overrides it to a different value.

Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Align mamba_ssm_cache_dtype with the Qwen3.5 HF config.

    If the cache config is 'auto' (or not explicitly set), adopt the
    value of the HF text config's mamba_ssm_dtype field, when it has
    one. If the user explicitly passed a different value, keep the
    user's value and log a warning. Nothing happens when the HF config
    does not specify mamba_ssm_dtype.
    """
    cache_config = vllm_config.cache_config
    text_cfg = vllm_config.model_config.hf_text_config
    model_dtype = getattr(text_cfg, "mamba_ssm_dtype", None)
    requested_dtype = cache_config.mamba_ssm_cache_dtype

    # Without a model-side dtype there is nothing to apply or compare.
    if model_dtype is None:
        return

    if requested_dtype == "auto":
        cache_config.mamba_ssm_cache_dtype = model_dtype
        return

    if requested_dtype != model_dtype:
        logger.warning(
            "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
            "but --mamba-ssm-cache-dtype='%s' was passed. "
            "Using the user-specified value.",
            model_dtype,
            cache_config.mamba_ssm_cache_dtype,
        )

UnlimitedOCRForCausalLMConfig

Bases: VerifyAndUpdateConfig

Methods:

Source code in vllm/model_executor/models/config.py
class UnlimitedOCRForCausalLMConfig(VerifyAndUpdateConfig):
    """Config hooks for Unlimited-OCR: attention backends, caching, R-SWA."""

    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Pick and validate the attention backends used by Unlimited-OCR.

        The text-model backend comes from the standard ``--attention-config``
        CLI argument and is handled as follows:

          * ``--attention-config '{"backend": "FLASH_ATTN"}'``
            FA4 + rswa_mask_mod, i.e. exact token-level R-SWA.
            ``flash_attn_version`` is set to 4 when it is unset or lower than
            4 (the R-SWA mask_mod requires FA4; FA3 cannot express it).

          * ``--attention-config '{"backend": "FLEX_ATTENTION"}'``
            FlexAttention R-SWA via a Triton block mask.

          * ``--attention-config '{"backend": "auto"}'`` (or omitted)
            Auto-detect: FA4 if available (H20/H100 SM90), else
            FlexAttention. This path is taken when
            ``attention_config.backend`` is ``None``.

        Independently of the backend, prefix caching is switched off for this
        model: R-SWA decode-phase KV is not a pure causal function of the
        prefix (so decode blocks are not reusable), and single-turn image-led
        OCR prompts rarely hit the prefix cache.

        When a multimodal config is present, the vision-encoder backend
        defaults to FlashAttention, and a FlashInfer selection is replaced by
        FlashAttention with a warning.

        Example of forcing FlexAttention even on a machine with FA4::

            vllm serve baidu/Unlimited-OCR \\
                --attention-config '{"backend": "FLEX_ATTENTION"}'

        Args:
            vllm_config: Config object; its attention, cache and multimodal
                settings are updated in place.

        Raises:
            RuntimeError: If FLASH_ATTN is selected but FA4 is not available.
            ValueError: If the backend is neither FLASH_ATTN nor
                FLEX_ATTENTION.
        """
        from vllm.v1.attention.backends.registry import AttentionBackendEnum
        from vllm.vllm_flash_attn import is_fa_version_supported

        attn_cfg = vllm_config.attention_config
        has_fa4 = is_fa_version_supported(4)

        # Stage 1: a backend of None means the user made no explicit choice,
        # so pick one based on FA4 availability.
        if attn_cfg.backend is None:
            if has_fa4:
                attn_cfg.backend = AttentionBackendEnum.FLASH_ATTN
            else:
                attn_cfg.backend = AttentionBackendEnum.FLEX_ATTENTION
            logger.info(
                "Unlimited-OCR: auto-selected attention backend=%s (fa4_available=%s).",
                attn_cfg.backend.value,
                has_fa4,
            )

        # Stage 2: validate and configure whichever backend is now in effect.
        backend = attn_cfg.backend
        if backend == AttentionBackendEnum.FLASH_ATTN:
            if not has_fa4:
                raise RuntimeError(
                    "Unlimited-OCR: --attention-config backend=FLASH_ATTN "
                    "requires FA4 (rswa_mask_mod), but FA4 is not available on "
                    "this device/installation.  Use backend=FLEX_ATTENTION or "
                    "upgrade vllm-flash-attn."
                )
            # On SM90 (H20) the default FA version is FA3 even when FA4 is
            # available (FA4 is only auto-selected when head_size > 256).
            # The R-SWA mask_mod needs FA4, so pin the version globally.
            current_version = attn_cfg.flash_attn_version
            if current_version is None or current_version < 4:
                if current_version is not None:
                    # An explicit lower version is overridden, but loudly.
                    logger.warning(
                        "Unlimited-OCR: flash_attn_version=%d cannot express the "
                        "R-SWA mask_mod; upgrading to 4.",
                        current_version,
                    )
                attn_cfg.flash_attn_version = 4
            logger.info(
                "Unlimited-OCR: FlashAttention FA%d + rswa_mask_mod — exact R-SWA.",
                attn_cfg.flash_attn_version,
            )
        elif backend == AttentionBackendEnum.FLEX_ATTENTION:
            # Mention the FA4 alternative only when it could actually be used.
            fa4_hint = ""
            if has_fa4:
                fa4_hint = (
                    " (FA4 available but not used; pass backend=FLASH_ATTN to upgrade)"
                )
            logger.info(
                "Unlimited-OCR: FlexAttention — R-SWA via Triton block mask%s.",
                fa4_hint,
            )
        else:
            raise ValueError(
                f"Unlimited-OCR: unsupported attention backend "
                f"{backend!r} for R-SWA. "
                "Use FLASH_ATTN (FA4) or FLEX_ATTENTION."
            )

        # R-SWA windows the *generated* tokens, so the KV of a decode token is
        # not a pure causal function of the prefix and must not be shared
        # across requests through prefix caching. Only the prompt/image prefix
        # would be cacheable, and single-turn image-led OCR prompts seldom
        # share one, so the feature adds KV-cache-manager complexity for
        # little gain. Turn it off for this model.
        cache_cfg = vllm_config.cache_config
        if cache_cfg.enable_prefix_caching:
            cache_cfg.enable_prefix_caching = False
            logger.info(
                "Unlimited-OCR: disabling prefix caching (R-SWA decode KV is not "
                "cacheable, and single-turn image-led prompts rarely hit the "
                "prefix cache)."
            )

        # Vision encoder backend: only relevant when a multimodal config exists.
        mm_cfg = getattr(vllm_config.model_config, "multimodal_config", None)
        if mm_cfg is None:
            return
        encoder_backend = mm_cfg.mm_encoder_attn_backend
        if encoder_backend is None:
            mm_cfg.mm_encoder_attn_backend = AttentionBackendEnum.FLASH_ATTN
        elif encoder_backend == AttentionBackendEnum.FLASHINFER:
            logger.warning(
                "Unlimited-OCR: FlashInfer is not supported for the vision "
                "encoder (the CLIP stage runs full attention without "
                "cu_seqlens); falling back to FlashAttention."
            )
            mm_cfg.mm_encoder_attn_backend = AttentionBackendEnum.FLASH_ATTN

    @staticmethod
    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        """Rewrite the HF config so the text model loads with R-SWA settings.

        Sets the text config's ``architectures`` to
        ``["DeepseekV2ForCausalLM"]``, defaults ``rswa_window`` to 128 on the
        top-level HF config when it is missing or ``None``, and copies the
        resulting value onto the text config.

        Args:
            model_config: Model config whose ``hf_config`` (and nested
                ``text_config``) are updated in place.
        """
        hf_cfg = model_config.hf_config
        text_cfg = hf_cfg.text_config
        text_cfg.architectures = ["DeepseekV2ForCausalLM"]

        if getattr(hf_cfg, "rswa_window", None) is None:
            hf_cfg.rswa_window = 128

        # Mirror rswa_window onto text_config so that DeepseekAttention (which
        # receives text_config as its vllm_config.model_config.hf_config via
        # init_vllm_registered_model) can read it and create RSWAAttention.
        text_cfg.rswa_window = hf_cfg.rswa_window

verify_and_update_config(vllm_config) staticmethod

Configure Unlimited-OCR attention backends for R-SWA and vision.

Backend selection — controlled by the standard --attention-config CLI argument (priority order):

  1. --attention-config '{"backend": "FLASH_ATTN"}' → FA4 + rswa_mask_mod. Exact token-level R-SWA. flash_attn_version is set to 4 when it is unset or lower than 4 (R-SWA mask_mod requires FA4; FA3 cannot express it); an explicitly lower version is upgraded with a warning. Raises if FA4 is not available on this device.

  2. --attention-config '{"backend": "FLEX_ATTENTION"}' → FlexAttention R-SWA via Triton block mask.

  3. --attention-config '{"backend": "auto"}' (or omitted) → Auto-detect: FA4 if available (H20/H100 SM90), else FlexAttention.

Regardless of backend, prefix caching is disabled for this model: R-SWA decode-phase KV is not a pure causal function of the prefix (so decode blocks are not reusable), and single-turn image-led OCR prompts rarely hit the prefix cache.

Example — force FlexAttention even on a machine with FA4:

vllm serve baidu/Unlimited-OCR \
    --attention-config '{"backend": "FLEX_ATTENTION"}'
Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Pick and validate the attention backends used by Unlimited-OCR.

    The text-model backend comes from the standard ``--attention-config``
    CLI argument and is handled as follows:

      * ``--attention-config '{"backend": "FLASH_ATTN"}'``
        FA4 + rswa_mask_mod, i.e. exact token-level R-SWA.
        ``flash_attn_version`` is set to 4 when it is unset or lower than
        4 (the R-SWA mask_mod requires FA4; FA3 cannot express it).

      * ``--attention-config '{"backend": "FLEX_ATTENTION"}'``
        FlexAttention R-SWA via a Triton block mask.

      * ``--attention-config '{"backend": "auto"}'`` (or omitted)
        Auto-detect: FA4 if available (H20/H100 SM90), else
        FlexAttention. This path is taken when
        ``attention_config.backend`` is ``None``.

    Independently of the backend, prefix caching is switched off for this
    model: R-SWA decode-phase KV is not a pure causal function of the
    prefix (so decode blocks are not reusable), and single-turn image-led
    OCR prompts rarely hit the prefix cache.

    When a multimodal config is present, the vision-encoder backend
    defaults to FlashAttention, and a FlashInfer selection is replaced by
    FlashAttention with a warning.

    Example of forcing FlexAttention even on a machine with FA4::

        vllm serve baidu/Unlimited-OCR \\
            --attention-config '{"backend": "FLEX_ATTENTION"}'

    Args:
        vllm_config: Config object; its attention, cache and multimodal
            settings are updated in place.

    Raises:
        RuntimeError: If FLASH_ATTN is selected but FA4 is not available.
        ValueError: If the backend is neither FLASH_ATTN nor
            FLEX_ATTENTION.
    """
    from vllm.v1.attention.backends.registry import AttentionBackendEnum
    from vllm.vllm_flash_attn import is_fa_version_supported

    attn_cfg = vllm_config.attention_config
    has_fa4 = is_fa_version_supported(4)

    # Stage 1: a backend of None means the user made no explicit choice,
    # so pick one based on FA4 availability.
    if attn_cfg.backend is None:
        if has_fa4:
            attn_cfg.backend = AttentionBackendEnum.FLASH_ATTN
        else:
            attn_cfg.backend = AttentionBackendEnum.FLEX_ATTENTION
        logger.info(
            "Unlimited-OCR: auto-selected attention backend=%s (fa4_available=%s).",
            attn_cfg.backend.value,
            has_fa4,
        )

    # Stage 2: validate and configure whichever backend is now in effect.
    backend = attn_cfg.backend
    if backend == AttentionBackendEnum.FLASH_ATTN:
        if not has_fa4:
            raise RuntimeError(
                "Unlimited-OCR: --attention-config backend=FLASH_ATTN "
                "requires FA4 (rswa_mask_mod), but FA4 is not available on "
                "this device/installation.  Use backend=FLEX_ATTENTION or "
                "upgrade vllm-flash-attn."
            )
        # On SM90 (H20) the default FA version is FA3 even when FA4 is
        # available (FA4 is only auto-selected when head_size > 256).
        # The R-SWA mask_mod needs FA4, so pin the version globally.
        current_version = attn_cfg.flash_attn_version
        if current_version is None or current_version < 4:
            if current_version is not None:
                # An explicit lower version is overridden, but loudly.
                logger.warning(
                    "Unlimited-OCR: flash_attn_version=%d cannot express the "
                    "R-SWA mask_mod; upgrading to 4.",
                    current_version,
                )
            attn_cfg.flash_attn_version = 4
        logger.info(
            "Unlimited-OCR: FlashAttention FA%d + rswa_mask_mod — exact R-SWA.",
            attn_cfg.flash_attn_version,
        )
    elif backend == AttentionBackendEnum.FLEX_ATTENTION:
        # Mention the FA4 alternative only when it could actually be used.
        fa4_hint = ""
        if has_fa4:
            fa4_hint = (
                " (FA4 available but not used; pass backend=FLASH_ATTN to upgrade)"
            )
        logger.info(
            "Unlimited-OCR: FlexAttention — R-SWA via Triton block mask%s.",
            fa4_hint,
        )
    else:
        raise ValueError(
            f"Unlimited-OCR: unsupported attention backend "
            f"{backend!r} for R-SWA. "
            "Use FLASH_ATTN (FA4) or FLEX_ATTENTION."
        )

    # R-SWA windows the *generated* tokens, so the KV of a decode token is
    # not a pure causal function of the prefix and must not be shared
    # across requests through prefix caching. Only the prompt/image prefix
    # would be cacheable, and single-turn image-led OCR prompts seldom
    # share one, so the feature adds KV-cache-manager complexity for
    # little gain. Turn it off for this model.
    cache_cfg = vllm_config.cache_config
    if cache_cfg.enable_prefix_caching:
        cache_cfg.enable_prefix_caching = False
        logger.info(
            "Unlimited-OCR: disabling prefix caching (R-SWA decode KV is not "
            "cacheable, and single-turn image-led prompts rarely hit the "
            "prefix cache)."
        )

    # Vision encoder backend: only relevant when a multimodal config exists.
    mm_cfg = getattr(vllm_config.model_config, "multimodal_config", None)
    if mm_cfg is None:
        return
    encoder_backend = mm_cfg.mm_encoder_attn_backend
    if encoder_backend is None:
        mm_cfg.mm_encoder_attn_backend = AttentionBackendEnum.FLASH_ATTN
    elif encoder_backend == AttentionBackendEnum.FLASHINFER:
        logger.warning(
            "Unlimited-OCR: FlashInfer is not supported for the vision "
            "encoder (the CLIP stage runs full attention without "
            "cu_seqlens); falling back to FlashAttention."
        )
        mm_cfg.mm_encoder_attn_backend = AttentionBackendEnum.FLASH_ATTN