vllm_gaudi.platform ¶

QWEN3_5_HYBRID_ARCHS `module-attribute` ¶

# Immutable set of model architecture names grouped as "Qwen3.5 hybrid".
# NOTE(review): presumably consulted by is_qwen3_5_hybrid_model(), whose
# definition is not shown on this page -- confirm against the module source.
QWEN3_5_HYBRID_ARCHS = frozenset(
    {
        "Qwen3_5ForConditionalGeneration",
        "Qwen3_5MoeForConditionalGeneration",
    }
)

logger `module-attribute` ¶

logger = logger()

HpuPlatform ¶

Bases: Platform

Source code in vllm_gaudi/platform.py

class HpuPlatform(Platform):
    """vLLM platform implementation for HPU devices.

    The platform is declared as out-of-tree (``PlatformEnum.OOT``). It carries
    static device metadata plus a set of hooks that override ``Platform``
    behaviour: backend selection, config adjustment, environment setup and
    KV-cache block copies.
    """
    _enum = PlatformEnum.OOT
    # Device identity strings.
    device_name: str = "hpu"
    device_type: str = "hpu"
    dispatch_key: str = "HPU"
    ray_device_key: str = "HPU"
    # Environment variable that selects which devices are visible.
    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
    # Quantization method names accepted on this platform.
    supported_quantization: list[str] = ["compressed-tensors", "fp8", "inc", "awq_hpu", "gptq_hpu", "modelopt"]
    simple_compile_backend = "hpu_backend"
    # Names of the environment variables (captured once, at class creation)
    # for which retain_envs() returns a truthy value.
    additional_env_vars = [k for k in os.environ if retain_envs(k)]

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
        num_heads: Optional[int] = None,
    ) -> str:
        """Return the dotted path of the attention backend class to use.

        Args:
            selected_backend: Not consulted by this implementation.
            attn_selector_config: Its ``use_sparse`` and ``use_mla`` flags
                drive the choice.
            num_heads: Not consulted by this implementation.

        Returns:
            The CPU attention backend path when the current vLLM config
            targets ``"cpu"``; otherwise the HPU MLA backend path when
            ``use_mla`` is set, else the HPU V1 attention backend path.

        Raises:
            NotImplementedError: If ``use_sparse`` is set.
        """
        from vllm.config import get_current_vllm_config
        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        current_vllm_config = get_current_vllm_config()
        if current_vllm_config.device_config.device_type == "cpu":
            logger.info("Using CPU_ATTN backend for CPU-targeted config.")
            return AttentionBackendEnum.CPU_ATTN.get_path()

        if attn_selector_config.use_sparse:
            raise NotImplementedError("Sparse Attention is not supported on HPU.")

        if attn_selector_config.use_mla:
            logger.info("Using HPUAttentionMLA backend.")
            return ("vllm_gaudi.attention.backends.hpu_attn."
                    "HPUMLAAttentionBackend")

        logger.info("Using HPUAttentionV1 backend.")
        return ("vllm_gaudi.v1.attention.backends."
                "hpu_attn.HPUAttentionBackendV1")

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return True unconditionally; ``enforce_eager`` is ignored."""
        return True

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.

        This implementation is a no-op: ``device`` is ignored.
        """
        return

    @classmethod
    def manual_seed_all(cls, seed: int) -> None:
        """Forward ``seed`` to ``torch.hpu.random.manual_seed_all``."""
        torch.hpu.random.manual_seed_all(seed)

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return ``cls.device_name``; ``device_id`` is ignored."""
        return cls.device_name

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Get the total memory of a device in bytes.

        Currently a workaround that always returns 0 (and logs a warning on
        every call); ``device_id`` is ignored. See the NOTE below.
        """
        # NOTE: This is a workaround.
        # The correct implementation of the method in this place should look as follows:
        # total_hpu_memory = torch.hpu.mem_get_info()[1]
        # A value of 0 is returned to preserve the current logic in
        # vllm/vllm/engine/arg_utils.py → get_batch_defaults() →
        # default_max_num_batched_tokens, in order to avoid the
        # error in hpu_perf_test, while also preventing a
        # NotImplementedError in test_defaults_with_usage_context.
        logger.warning("This is a workaround! Please check the NOTE "
                       "in the get_device_total_memory definition.")

        total_hpu_memory = 0

        return total_hpu_memory

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Adjust ``vllm_config`` in place for HPU.

        In order, this method:

        * replaces an ``"auto"`` worker class with the HPU worker path;
        * sets ``block_size`` to 128 unless the user specified one (a
          user-specified value is still reset to 128 for Qwen3.5 hybrids);
        * for hybrid models, rounds ``mamba_page_size_padded`` up to a
          multiple of the attention page size;
        * switches the worker multiprocessing method from ``fork`` to
          ``spawn`` unless ``fork`` was requested through the environment;
        * replaces float16/float32 model dtypes with bfloat16;
        * forces custom ops on, CUDA graphs off and ``CompilationMode.NONE``;
        * disables prefix caching when ``VLLM_CONTIGUOUS_PA`` is enabled and
          changes ``mamba_cache_mode`` from ``"all"`` to ``"align"`` when
          prefix caching stays enabled;
        * loads weights to CPU first for INC quantization;
        * sets ``VLLM_DISABLE_SHARED_EXPERTS_STREAM=1``;
        * turns async scheduling off when a speculative config is present.

        Args:
            vllm_config: The configuration object to mutate.
        """
        parallel_config = vllm_config.parallel_config

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                    "vllm_gaudi.v1.worker.hpu_worker.HPUWorker"

        # NOTE(kzawora): default block size for Gaudi should be 128
        # smaller sizes still work, but very inefficiently
        cache_config = vllm_config.cache_config
        if not cache_config.user_specified_block_size:
            cache_config.block_size = 128
        elif is_qwen3_5_hybrid_model(vllm_config.model_config) and cache_config.block_size != 128:
            # Narrow the reset to Qwen3.5 hybrids. Other hybrid models may
            # legitimately use a larger KV-manager block size and rely on
            # virtual block splitting down to 128-token HPU kernels.
            logger.info(
                "Resetting Qwen3.5 hybrid block_size from %d to 128 "
                "before Gaudi hybrid page-size realignment.",
                cache_config.block_size,
            )
            cache_config.block_size = 128
            if cache_config.mamba_cache_mode == "align":
                cache_config.mamba_block_size = 128
        # Hybrid GDN/Mamba models: upstream HybridAttentionMambaModelConfig
        # already ran and computed block_size / mamba_page_size_padded for
        # GPU.  HPU overrode block_size to 128 above, so we must re-align
        # mamba_page_size_padded to be a multiple of the HPU attention page
        # size (block_size * per-token KV bytes).  Without this the upstream
        # unify_kv_cache_spec_page_size() fails because the two page sizes
        # are not divisible.
        if (cache_config and cache_config.block_size is not None and vllm_config.model_config is not None
                and vllm_config.model_config.is_hybrid and cache_config.mamba_page_size_padded is not None):
            # Recompute mamba_page_size_padded so it is a multiple of
            # the HPU attention page size.
            from vllm.utils.torch_utils import get_dtype_size
            from math import ceil
            model_config = vllm_config.model_config
            if cache_config.cache_dtype == "auto":
                kv_dtype = model_config.dtype
            else:
                from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
                kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
            num_kv_heads = model_config.get_num_kv_heads(parallel_config)
            head_size = model_config.get_head_size()
            # The leading factor 2 presumably accounts for separate key and
            # value entries -- TODO confirm.
            attn_page = (2 * cache_config.block_size * num_kv_heads * head_size * get_dtype_size(kv_dtype))
            if attn_page > 0 and cache_config.mamba_page_size_padded % attn_page != 0:
                old_padded = cache_config.mamba_page_size_padded
                # Round up to the next multiple of attn_page.
                cache_config.mamba_page_size_padded = (ceil(old_padded / attn_page) * attn_page)
                logger.info(
                    "Rescaled mamba_page_size_padded from %d to %d "
                    "to align with HPU attention page size %d "
                    "(block_size=%d).",
                    old_padded,
                    cache_config.mamba_page_size_padded,
                    attn_page,
                    cache_config.block_size,
                )
        if (parallel_config.distributed_executor_backend in ['mp', 'uni']
                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
            # 'fork' is only kept when the variable is present in the
            # environment, i.e. the user asked for it explicitly.
            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Using "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                               "as it was explicitly requested.")
            else:
                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Setting "
                               "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                               "To override that behavior, please set "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        if (vllm_config.model_config is not None and vllm_config.model_config.dtype in (torch.float16, torch.float32)):
            logger.warning("The HPU backend currently does not support %s. "
                           "Using bfloat16 instead.", vllm_config.model_config.dtype)
            vllm_config.model_config.dtype = torch.bfloat16

        from vllm.config import CompilationMode, CUDAGraphMode
        compilation_config = vllm_config.compilation_config
        # Activate custom ops for v1.
        compilation_config.custom_ops = ["all"]
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
        compilation_config.cudagraph_capture_sizes = []

        if get_config().VLLM_CONTIGUOUS_PA:
            logger.warning("Using Contiguous PA, disabling prefix caching")
            vllm_config.cache_config.enable_prefix_caching = False

        if (vllm_config.cache_config.enable_prefix_caching and vllm_config.cache_config.mamba_cache_mode == "all"):
            vllm_config.cache_config.mamba_cache_mode = "align"
            logger.info("[HPU] Overriding mamba_cache_mode from 'all' to 'align' "
                        "to ensure block-aligned chunked prefill splits.")

        if (vllm_config.model_config is not None and vllm_config.model_config.is_hybrid):
            logger.debug(
                "[HPU] Hybrid model cache config: block_size=%s, "
                "mamba_block_size=%s, mamba_cache_mode=%s, "
                "enable_prefix_caching=%s", cache_config.block_size, getattr(cache_config, "mamba_block_size", None),
                getattr(cache_config, "mamba_cache_mode", None), cache_config.enable_prefix_caching)

        if compilation_config.mode != CompilationMode.NONE:
            logger.info("[HPU] Forcing CompilationMode.NONE "
                        "compilation mode")
            compilation_config.mode = CompilationMode.NONE

        # Force CPU loading for INC quantization to prevent OOM during weight loading.
        # INC FP8 quantization requires weights to be loaded to CPU first, then
        # quantized and moved to device. Without this, weights are loaded directly
        # to HPU in BF16 which causes OOM for large models.
        model_config = vllm_config.model_config
        # Either the model config asks for "inc" or QUANT_CONFIG is set to a
        # non-empty value in the environment.
        is_inc_quant = (model_config is not None and model_config.quantization == "inc") or os.getenv("QUANT_CONFIG")
        if is_inc_quant and vllm_config.load_config is not None and vllm_config.load_config.device is None:
            logger.info("[HPU] INC quantization detected, loading weights to CPU first")
            vllm_config.load_config.device = "cpu"

        # Disable multi-stream for shared experts as no Stream on CPU
        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"

        # NOTE: vLLM enables async scheduling by default when speculative
        # decoding is on. On HPU, speculative decoding is not supported
        # together with async scheduling, so turn it off in that case.
        vllm_config.scheduler_config.async_scheduling = \
            vllm_config.scheduler_config.async_scheduling and vllm_config.speculative_config is None

    @classmethod
    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
        """Pick the KV-cache block size, special-casing ``granitemoehybrid``.

        For models whose ``hf_config.model_type`` is ``"granitemoehybrid"``
        the attention block size is computed here from the mamba page size
        and then the parent implementation is called with the block size
        marked as user-specified so that it is preserved. For every other
        model the call is delegated unchanged to the parent implementation.

        Args:
            vllm_config: The configuration object to mutate.
        """

        cache_config = vllm_config.cache_config
        model_config = vllm_config.model_config

        # For Granite 4.0-H (granitemoehybrid), we compute the correct
        # block_size in this method using the PC-aware alignment formula
        # (528 without prefix caching, 768 with prefix caching).
        # We set block_size before calling super and mark it as
        # user-specified so Phase 1 preserves it; Phase 2
        # (_align_hybrid_block_size) then validates and sets
        # mamba_page_size_padded.
        is_granite_hybrid = (model_config is not None
                             and getattr(model_config.hf_config, "model_type", None) == "granitemoehybrid")
        if is_granite_hybrid:
            # Compute the correct block_size using the PC-aware formula.
            from vllm.utils.math_utils import cdiv
            from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
            from vllm.model_executor.models import ModelRegistry
            if cache_config.cache_dtype == "auto":
                kv_dtype = model_config.dtype
            else:
                from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
                kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
            # Page size in bytes of an attention spec built with block_size=1.
            attn_1tok = FullAttentionSpec(
                block_size=1,
                num_kv_heads=model_config.get_num_kv_heads(vllm_config.parallel_config),
                head_size=model_config.get_head_size(),
                dtype=kv_dtype,
            ).page_size_bytes
            model_cls, _ = ModelRegistry.resolve_model_cls(
                model_config.architecture,
                model_config=model_config,
            )
            mamba_page_size = MambaSpec(
                shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
                dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
                block_size=-1,
            ).page_size_bytes
            if mamba_page_size > 0:
                if cache_config.enable_prefix_caching:
                    mamba_chunk_size = getattr(model_config.hf_config, 'mamba_d_chunk', 256)
                    alignment = mamba_chunk_size
                else:
                    alignment = 16
                # Smallest multiple of `alignment` tokens whose attention
                # page is at least as large as the mamba page.
                attn_block_size = alignment * cdiv(mamba_page_size, alignment * attn_1tok)
                cache_config.block_size = attn_block_size
                if cache_config.mamba_cache_mode == "align":
                    cache_config.mamba_block_size = attn_block_size
                logger.info(
                    "Setting granitemoehybrid block_size to %d tokens "
                    "(alignment=%d, mamba_page_size=%d bytes, "
                    "prefix_caching=%s).",
                    attn_block_size,
                    alignment,
                    mamba_page_size,
                    cache_config.enable_prefix_caching,
                )

        if is_granite_hybrid and not cache_config.user_specified_block_size:
            # Temporarily pretend the block size came from the user so the
            # parent keeps the value; always restore the flag afterwards,
            # even if the parent raises.
            cache_config.user_specified_block_size = True
            try:
                super().update_block_size_for_backend(vllm_config)
            finally:
                cache_config.user_specified_block_size = False
        else:
            super().update_block_size_for_backend(vllm_config)

    @classmethod
    def is_pin_memory_available(cls):
        """Log a warning and return False."""
        logger.warning("Pin memory is not supported on HPU.")
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """Return the dotted path of the HPU Punica (LoRA) wrapper class."""
        return "vllm_gaudi.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        """Return True unconditionally."""
        return True

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Return the dotted path of the HPU device communicator class."""
        return "vllm_gaudi.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa

    @classmethod
    def supports_structured_output(cls) -> bool:
        """Return True unconditionally."""
        return True

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        """Return True unconditionally; ``model_config`` is ignored."""
        # V1 support on HPU is experimental
        return True

    @classmethod
    def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
        """Return the NIXL device mapping ``{"hpu": ("cpu", "hpu")}``."""
        return {"hpu": ("cpu", "hpu")}

    @classmethod
    def get_nixl_memory_type(cls) -> str:
        """Return ``"VRAM"`` or ``"DRAM"`` based on the environment.

        ``"VRAM"`` is returned when ``VLLM_NIXL_DEVICE_TO_DEVICE`` is ``1``
        or ``true`` (case-insensitive); otherwise ``"DRAM"``.
        """
        if os.environ.get("VLLM_NIXL_DEVICE_TO_DEVICE", "0").lower() in ["1", "true"]:
            return "VRAM"
        else:
            return "DRAM"

    # Declared as a classmethod like every other hook of this class; it still
    # works when invoked on an instance.
    @classmethod
    def is_sleep_mode_available(cls) -> bool:
        """Return True unconditionally."""
        return True

    # Markers to track which env vars were auto-set by set_torch_compile()
    # in eager mode, so the lazy branch can remove them if they leaked
    # into a subprocess (e.g. via pytest plugin loading vllm_gaudi).
    _MARKER_RUNTIME_SCALE_PATCHING = '_VLLM_AUTOSET_RUNTIME_SCALE_PATCHING'
    _MARKER_FUSER_MULTI_THREADED = '_VLLM_AUTOSET_FUSER_MULTI_THREADED'

    @classmethod
    def set_torch_compile(cls) -> None:
        """Configure environment variables for lazy or eager HPU mode.

        ``PT_HPU_WEIGHT_SHARING`` defaults to ``'0'`` in both modes. In lazy
        mode dynamo is disabled, lazy collectives are enabled and eager-only
        variables that carry an auto-set marker are removed. In eager mode
        ``RUNTIME_SCALE_PATCHING`` and
        ``FUSER_ENABLE_MULTI_THREADED_INVOCATIONS`` default to ``'1'`` and a
        marker variable records that the value was set here rather than by
        the user.
        """
        # NOTE: PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
        # does not support torch.compile
        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
        # torch.compile support

        # PT_HPU_WEIGHT_SHARING=0 is needed in both lazy and eager modes.
        # Only set if not already provided by the user.
        if os.environ.get('PT_HPU_WEIGHT_SHARING') is None:
            os.environ['PT_HPU_WEIGHT_SHARING'] = '0'
        is_lazy = htorch.utils.internal.is_lazy()
        if is_lazy:
            torch._dynamo.config.disable = True
            # NOTE multi-HPU inference with HPUGraphs (lazy-only)
            # requires enabling lazy collectives
            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html  # noqa: E501
            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
            # Remove eager-mode-only env vars that were auto-set by a prior
            # set_torch_compile() call (e.g. in a parent pytest process
            # that loaded vllm_gaudi as a plugin in eager mode).
            # User-explicitly-set values are left untouched.
            if os.environ.pop(cls._MARKER_RUNTIME_SCALE_PATCHING, None):
                os.environ.pop('RUNTIME_SCALE_PATCHING', None)
                logger.info("Removed inherited RUNTIME_SCALE_PATCHING "
                            "(auto-set by parent process in eager mode)")
            if os.environ.pop(cls._MARKER_FUSER_MULTI_THREADED, None):
                os.environ.pop('FUSER_ENABLE_MULTI_THREADED_INVOCATIONS', None)
                logger.info("Removed inherited "
                            "FUSER_ENABLE_MULTI_THREADED_INVOCATIONS "
                            "(auto-set by parent process in eager mode)")
        else:
            # If not set by user then for torch compile enable Runtime scale patching by default
            if os.environ.get('RUNTIME_SCALE_PATCHING') is None:
                os.environ['RUNTIME_SCALE_PATCHING'] = '1'
                os.environ[cls._MARKER_RUNTIME_SCALE_PATCHING] = '1'
            # This allows for utilization of the Parallel Compilation feature
            if os.environ.get('FUSER_ENABLE_MULTI_THREADED_INVOCATIONS') is None:
                os.environ['FUSER_ENABLE_MULTI_THREADED_INVOCATIONS'] = '1'
                os.environ[cls._MARKER_FUSER_MULTI_THREADED] = '1'

    @classmethod
    def adjust_cuda_hooks(cls) -> None:
        """Monkey-patch ``torch`` so CUDA reports as unavailable.

        Also replaces ``torch.hpu.get_device_properties`` with the CUDA
        implementation.
        """
        torch.cuda.is_available = lambda: False
        # hpu.get_device_properties implementation is weird
        # cuda.get_device_properties implementation is correct
        # replace hpu.get_device_properties with cuda.get_device_properties
        torch.hpu.get_device_properties = torch.cuda.get_device_properties

    @classmethod
    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, model_config: ModelConfig) -> bool:
        """Return True only for ``kv_cache_dtype == "fp8_inc"``.

        ``model_config`` is ignored.
        """
        return kv_cache_dtype == "fp8_inc"

    @classmethod
    def use_sync_weight_loader(cls) -> bool:
        """
        Returns if the current platform needs to sync weight loader.

        Controlled by ``VLLM_WEIGHT_LOAD_FORCE_SYNC`` (default ``"true"``);
        the values ``true`` and ``1`` (case-insensitive) enable it.
        """
        force_sync = os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", "true").lower() in ("true", "1")
        return force_sync

    @classmethod
    def make_synced_weight_loader(cls, original_weight_loader):
        """
        Wrap the original weight loader to make it synced.

        The returned callable forwards all arguments to
        ``original_weight_loader``, calls ``torch.hpu.synchronize()`` and
        then returns the loader's result.
        """

        def _synced_weight_loader(param, *args, **kwargs):
            out = original_weight_loader(param, *args, **kwargs)
            torch.hpu.synchronize()
            return out

        return _synced_weight_loader

    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: Union[tuple[torch.Tensor], torch.Tensor],
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from src_cache to dst_cache on HPU.

        Args:
            src_cache: Source tensor. When ``dst_cache`` is a tuple, blocks
                are selected along dimension 1 and entries 0 and 1 of
                dimension 0 are copied into ``dst_cache[0]`` and
                ``dst_cache[1]``; otherwise blocks are selected along
                dimension 0.
            dst_cache: Destination tensor, or a tuple of destination tensors,
                updated in place with ``index_copy_`` along dimension 0.
            src_block_indices: Indices of the blocks to read.
            dst_block_indices: Indices of the blocks to write.
        """
        # WA: https://github.com/pytorch/pytorch/issues/169656
        # float8 sources are indexed (and, for tuples, moved) through a uint8
        # view and viewed back as the original dtype before the copy.
        original_src_dtype = src_cache.dtype
        view_as_uint = original_src_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]
        if view_as_uint:
            src_cache = src_cache.view(torch.uint8)
        if isinstance(dst_cache, tuple):
            _src_cache = src_cache[:, src_block_indices]
            _src_cache = _src_cache.to(dst_cache[0].device)
            dst_cache[0].index_copy_(0, dst_block_indices,
                                     _src_cache[0].view(original_src_dtype) if view_as_uint else _src_cache[0])
            dst_cache[1].index_copy_(0, dst_block_indices,
                                     _src_cache[1].view(original_src_dtype) if view_as_uint else _src_cache[1])
        else:
            indexed_cache = src_cache[src_block_indices]
            if view_as_uint:
                indexed_cache = indexed_cache.view(original_src_dtype)
            dst_cache.index_copy_(0, dst_block_indices, indexed_cache.to(dst_cache.device))
        torch.hpu.synchronize()

    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: Union[tuple[torch.Tensor], torch.Tensor],
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from HPU to host (CPU).

        Args:
            src_cache: Source tensor, or a tuple of source tensors. For a
                tuple the selected blocks of every member are stacked along
                a new leading dimension.
            dst_cache: Destination tensor, written in place; indexed along
                dimension 1 for tuple sources and dimension 0 otherwise.
            src_block_indices: Indices of the blocks to read.
            dst_block_indices: Indices of the blocks to write.
        """
        if isinstance(src_cache, tuple):
            _src_cache = torch.stack([c[src_block_indices] for c in src_cache], dim=0)
            dst_cache[:, dst_block_indices] = _src_cache.cpu()
        else:
            dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()

    @classmethod
    def patch_for_pt27(cls) -> None:
        """Replace ``BasevLLMParameter.__torch_function__`` on torch < 2.8.0.

        Does nothing when ``is_torch_equal_or_newer("2.8.0")`` is true.
        Otherwise the replacement forwards to the ``__torch_function__`` of
        the next class in ``BasevLLMParameter``'s MRO, or returns
        ``NotImplemented`` when that class does not define one.
        """

        from vllm.utils.torch_utils import is_torch_equal_or_newer
        if is_torch_equal_or_newer("2.8.0"):
            return

        from vllm.model_executor import BasevLLMParameter
        parent_class = BasevLLMParameter.__mro__[1]
        parent_torch_function = getattr(parent_class, "__torch_function__", None)

        def torch_function(origin_cls, func, types, args=(), kwargs=None):
            if kwargs is None:
                kwargs = {}
            if parent_torch_function is None:
                return NotImplemented
            return parent_torch_function(func, types, args, kwargs)

        # Must be a classmethod: torch invokes __torch_function__ with
        # (func, types, args, kwargs), so the class has to be bound to
        # `origin_cls`. With staticmethod every argument would shift one
        # position and the parent would be called with `types` as the
        # function.
        BasevLLMParameter.__torch_function__ = classmethod(torch_function)  # type: ignore[assignment]
        return

_MARKER_FUSER_MULTI_THREADED `class-attribute` `instance-attribute` ¶

_MARKER_FUSER_MULTI_THREADED = (
    "_VLLM_AUTOSET_FUSER_MULTI_THREADED"
)

_MARKER_RUNTIME_SCALE_PATCHING `class-attribute` `instance-attribute` ¶

_MARKER_RUNTIME_SCALE_PATCHING = (
    "_VLLM_AUTOSET_RUNTIME_SCALE_PATCHING"
)

_enum `class-attribute` `instance-attribute` ¶

_enum = PlatformEnum.OOT

additional_env_vars `class-attribute` `instance-attribute` ¶

additional_env_vars = [
    k for k, v in os.environ.items() if retain_envs(k)
]

device_control_env_var `class-attribute` `instance-attribute` ¶

device_control_env_var: str = 'HABANA_VISIBLE_MODULES'

device_name `class-attribute` `instance-attribute` ¶

device_name: str = 'hpu'

device_type `class-attribute` `instance-attribute` ¶

device_type: str = 'hpu'

dispatch_key `class-attribute` `instance-attribute` ¶

dispatch_key: str = 'HPU'

ray_device_key `class-attribute` `instance-attribute` ¶

ray_device_key: str = 'HPU'

simple_compile_backend `class-attribute` `instance-attribute` ¶

simple_compile_backend = 'hpu_backend'

supported_quantization `class-attribute` `instance-attribute` ¶

supported_quantization: list[str] = [
    "compressed-tensors",
    "fp8",
    "inc",
    "awq_hpu",
    "gptq_hpu",
    "modelopt",
]

adjust_cuda_hooks `classmethod` ¶

adjust_cuda_hooks() -> None

Source code in vllm_gaudi/platform.py

@classmethod
def adjust_cuda_hooks(cls) -> None:
    """Patch ``torch`` so CUDA reports as unavailable.

    ``torch.hpu.get_device_properties`` is additionally replaced by the
    CUDA implementation.
    """
    # The hpu.get_device_properties implementation is weird while the
    # cuda one is correct, so the cuda function is reused for hpu.
    cuda_device_properties = torch.cuda.get_device_properties
    torch.cuda.is_available = lambda: False
    torch.hpu.get_device_properties = cuda_device_properties

check_and_update_config `classmethod` ¶

check_and_update_config(vllm_config: VllmConfig) -> None

Source code in vllm_gaudi/platform.py

@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
    """Adjust ``vllm_config`` in place for HPU.

    In order, this method:

    * replaces an ``"auto"`` worker class with the HPU worker path;
    * sets ``block_size`` to 128 unless the user specified one (a
      user-specified value is still reset to 128 for Qwen3.5 hybrids);
    * for hybrid models, rounds ``mamba_page_size_padded`` up to a
      multiple of the attention page size;
    * switches the worker multiprocessing method from ``fork`` to
      ``spawn`` unless ``fork`` was requested through the environment;
    * replaces float16/float32 model dtypes with bfloat16;
    * forces custom ops on, CUDA graphs off and ``CompilationMode.NONE``;
    * disables prefix caching when ``VLLM_CONTIGUOUS_PA`` is enabled and
      changes ``mamba_cache_mode`` from ``"all"`` to ``"align"`` when
      prefix caching stays enabled;
    * loads weights to CPU first for INC quantization;
    * sets ``VLLM_DISABLE_SHARED_EXPERTS_STREAM=1``;
    * turns async scheduling off when a speculative config is present.

    Args:
        vllm_config: The configuration object to mutate.
    """
    parallel_config = vllm_config.parallel_config

    if parallel_config.worker_cls == "auto":
        parallel_config.worker_cls = \
                "vllm_gaudi.v1.worker.hpu_worker.HPUWorker"

    # NOTE(kzawora): default block size for Gaudi should be 128
    # smaller sizes still work, but very inefficiently
    cache_config = vllm_config.cache_config
    if not cache_config.user_specified_block_size:
        cache_config.block_size = 128
    elif is_qwen3_5_hybrid_model(vllm_config.model_config) and cache_config.block_size != 128:
        # Narrow the reset to Qwen3.5 hybrids. Other hybrid models may
        # legitimately use a larger KV-manager block size and rely on
        # virtual block splitting down to 128-token HPU kernels.
        logger.info(
            "Resetting Qwen3.5 hybrid block_size from %d to 128 "
            "before Gaudi hybrid page-size realignment.",
            cache_config.block_size,
        )
        cache_config.block_size = 128
        if cache_config.mamba_cache_mode == "align":
            cache_config.mamba_block_size = 128
    # Hybrid GDN/Mamba models: upstream HybridAttentionMambaModelConfig
    # already ran and computed block_size / mamba_page_size_padded for
    # GPU.  HPU overrode block_size to 128 above, so we must re-align
    # mamba_page_size_padded to be a multiple of the HPU attention page
    # size (block_size * per-token KV bytes).  Without this the upstream
    # unify_kv_cache_spec_page_size() fails because the two page sizes
    # are not divisible.
    if (cache_config and cache_config.block_size is not None and vllm_config.model_config is not None
            and vllm_config.model_config.is_hybrid and cache_config.mamba_page_size_padded is not None):
        # Recompute mamba_page_size_padded so it is a multiple of
        # the HPU attention page size.
        from vllm.utils.torch_utils import get_dtype_size
        from math import ceil
        model_config = vllm_config.model_config
        if cache_config.cache_dtype == "auto":
            kv_dtype = model_config.dtype
        else:
            from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
            kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
        num_kv_heads = model_config.get_num_kv_heads(parallel_config)
        head_size = model_config.get_head_size()
        # The leading factor 2 presumably accounts for separate key and
        # value entries -- TODO confirm.
        attn_page = (2 * cache_config.block_size * num_kv_heads * head_size * get_dtype_size(kv_dtype))
        if attn_page > 0 and cache_config.mamba_page_size_padded % attn_page != 0:
            old_padded = cache_config.mamba_page_size_padded
            # Round up to the next multiple of attn_page.
            cache_config.mamba_page_size_padded = (ceil(old_padded / attn_page) * attn_page)
            logger.info(
                "Rescaled mamba_page_size_padded from %d to %d "
                "to align with HPU attention page size %d "
                "(block_size=%d).",
                old_padded,
                cache_config.mamba_page_size_padded,
                attn_page,
                cache_config.block_size,
            )
    if (parallel_config.distributed_executor_backend in ['mp', 'uni']
            and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
        # 'fork' is only kept when the variable is present in the
        # environment, i.e. the user asked for it explicitly.
        if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                           "might cause application hangs on exit. Using "
                           "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                           "as it was explicitly requested.")
        else:
            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                           "might cause application hangs on exit. Setting "
                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                           "To override that behavior, please set "
                           "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    if (vllm_config.model_config is not None and vllm_config.model_config.dtype in (torch.float16, torch.float32)):
        logger.warning("The HPU backend currently does not support %s. "
                       "Using bfloat16 instead.", vllm_config.model_config.dtype)
        vllm_config.model_config.dtype = torch.bfloat16

    from vllm.config import CompilationMode, CUDAGraphMode
    compilation_config = vllm_config.compilation_config
    # Activate custom ops for v1.
    compilation_config.custom_ops = ["all"]
    compilation_config.cudagraph_mode = CUDAGraphMode.NONE
    compilation_config.cudagraph_capture_sizes = []

    if get_config().VLLM_CONTIGUOUS_PA:
        logger.warning("Using Contiguous PA, disabling prefix caching")
        vllm_config.cache_config.enable_prefix_caching = False

    if (vllm_config.cache_config.enable_prefix_caching and vllm_config.cache_config.mamba_cache_mode == "all"):
        vllm_config.cache_config.mamba_cache_mode = "align"
        logger.info("[HPU] Overriding mamba_cache_mode from 'all' to 'align' "
                    "to ensure block-aligned chunked prefill splits.")

    if (vllm_config.model_config is not None and vllm_config.model_config.is_hybrid):
        logger.debug(
            "[HPU] Hybrid model cache config: block_size=%s, "
            "mamba_block_size=%s, mamba_cache_mode=%s, "
            "enable_prefix_caching=%s", cache_config.block_size, getattr(cache_config, "mamba_block_size", None),
            getattr(cache_config, "mamba_cache_mode", None), cache_config.enable_prefix_caching)

    if compilation_config.mode != CompilationMode.NONE:
        logger.info("[HPU] Forcing CompilationMode.NONE "
                    "compilation mode")
        compilation_config.mode = CompilationMode.NONE

    # Force CPU loading for INC quantization to prevent OOM during weight loading.
    # INC FP8 quantization requires weights to be loaded to CPU first, then
    # quantized and moved to device. Without this, weights are loaded directly
    # to HPU in BF16 which causes OOM for large models.
    model_config = vllm_config.model_config
    # Either the model config asks for "inc" or QUANT_CONFIG is set to a
    # non-empty value in the environment.
    is_inc_quant = (model_config is not None and model_config.quantization == "inc") or os.getenv("QUANT_CONFIG")
    if is_inc_quant and vllm_config.load_config is not None and vllm_config.load_config.device is None:
        logger.info("[HPU] INC quantization detected, loading weights to CPU first")
        vllm_config.load_config.device = "cpu"

    # Disable multi-stream for shared experts as no Stream on CPU
    os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"

    # NOTE: vLLM enables async scheduling by default when speculative
    # decoding is on. On HPU, speculative decoding is not supported
    # together with async scheduling, so turn it off in that case.
    vllm_config.scheduler_config.async_scheduling = \
        vllm_config.scheduler_config.async_scheduling and vllm_config.speculative_config is None

get_attn_backend_cls `classmethod` ¶

get_attn_backend_cls(
    selected_backend: AttentionBackendEnum,
    attn_selector_config: AttentionSelectorConfig,
    num_heads: Optional[int] = None,
) -> str

Source code in vllm_gaudi/platform.py

@classmethod
def get_attn_backend_cls(
    cls,
    selected_backend: "AttentionBackendEnum",
    attn_selector_config: "AttentionSelectorConfig",
    num_heads: Optional[int] = None,
) -> str:
    """Return the dotted path of the attention backend class to use.

    ``selected_backend`` and ``num_heads`` are not consulted. A config that
    targets ``"cpu"`` gets the CPU attention backend; otherwise
    ``use_mla`` selects the HPU MLA backend and everything else the HPU V1
    attention backend.

    Raises:
        NotImplementedError: If ``attn_selector_config.use_sparse`` is set.
    """
    from vllm.config import get_current_vllm_config
    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    target_device = get_current_vllm_config().device_config.device_type
    if target_device == "cpu":
        logger.info("Using CPU_ATTN backend for CPU-targeted config.")
        return AttentionBackendEnum.CPU_ATTN.get_path()

    if attn_selector_config.use_sparse:
        raise NotImplementedError("Sparse Attention is not supported on HPU.")

    if not attn_selector_config.use_mla:
        logger.info("Using HPUAttentionV1 backend.")
        return "vllm_gaudi.v1.attention.backends.hpu_attn.HPUAttentionBackendV1"

    logger.info("Using HPUAttentionMLA backend.")
    return "vllm_gaudi.attention.backends.hpu_attn.HPUMLAAttentionBackend"

get_device_communicator_cls `classmethod` ¶

get_device_communicator_cls() -> str

Source code in vllm_gaudi/platform.py

@classmethod
def get_device_communicator_cls(cls) -> str:
    """Return the dotted import path of the HPU device communicator class."""
    communicator_path = "vllm_gaudi.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa
    return communicator_path

get_device_name `classmethod` ¶

get_device_name(device_id: int = 0) -> str

Source code in vllm_gaudi/platform.py

@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
    """Return the class-level ``device_name``.

    ``device_id`` is not used: the same name is returned for every id.
    """
    name = cls.device_name
    return name

get_device_total_memory `classmethod` ¶

get_device_total_memory(device_id: int = 0) -> int

Get the total memory of a device in bytes.

Source code in vllm_gaudi/platform.py

@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Get the total memory of a device in bytes.

    Currently a workaround: a warning is logged on every call and 0 is
    returned regardless of ``device_id``.
    """
    # NOTE: This is a workaround.
    # The correct implementation of the method in this place should look as follows:
    # total_hpu_memory = torch.hpu.mem_get_info()[1]
    # A value of 0 is returned to preserve the current logic in
    # vllm/vllm/engine/arg_utils.py → get_batch_defaults() →
    # default_max_num_batched_tokens, in order to avoid the
    # error in hpu_perf_test, while also preventing a
    # NotImplementedError in test_defaults_with_usage_context.
    logger.warning("This is a workaround! Please check the NOTE "
                   "in the get_device_total_memory definition.")
    return 0

get_nixl_memory_type `classmethod` ¶

get_nixl_memory_type() -> str

Source code in vllm_gaudi/platform.py

@classmethod
def get_nixl_memory_type(cls) -> str:
    """Return the NIXL memory type selected by ``VLLM_NIXL_DEVICE_TO_DEVICE``.

    Returns ``"VRAM"`` when the variable is ``1`` or ``true``
    (case-insensitive), otherwise ``"DRAM"``. An unset variable is treated
    as ``"0"``.
    """
    device_to_device = os.environ.get("VLLM_NIXL_DEVICE_TO_DEVICE", "0").lower()
    return "VRAM" if device_to_device in ("1", "true") else "DRAM"

get_nixl_supported_devices `classmethod` ¶

get_nixl_supported_devices() -> dict[str, tuple[str, ...]]

Source code in vllm_gaudi/platform.py

@classmethod
def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
    """Return the NIXL device map: ``"hpu"`` mapped to ``("cpu", "hpu")``."""
    hpu_entries = ("cpu", "hpu")
    return {"hpu": hpu_entries}

get_punica_wrapper `classmethod` ¶

get_punica_wrapper() -> str

Source code in vllm_gaudi/platform.py

@classmethod
def get_punica_wrapper(cls) -> str:
    """Return the dotted import path of the HPU Punica (LoRA) wrapper class."""
    wrapper_path = "vllm_gaudi.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
    return wrapper_path

insert_blocks_to_device `classmethod` ¶

insert_blocks_to_device(
    src_cache: Tensor,
    dst_cache: Union[tuple[Tensor], Tensor],
    src_block_indices: Tensor,
    dst_block_indices: Tensor,
) -> None

Copy blocks from src_cache to dst_cache on HPU.

Source code in vllm_gaudi/platform.py

@classmethod
def insert_blocks_to_device(
    cls,
    src_cache: torch.Tensor,
    dst_cache: Union[tuple[torch.Tensor], torch.Tensor],
    src_block_indices: torch.Tensor,
    dst_block_indices: torch.Tensor,
) -> None:
    """Copy blocks from src_cache to dst_cache on HPU.

    When ``dst_cache`` is a tuple, ``src_cache`` is indexed along its second
    dimension and slices 0 and 1 of the result are written into
    ``dst_cache[0]`` and ``dst_cache[1]``. Otherwise ``src_cache`` is indexed
    along its first dimension and written into ``dst_cache`` directly.
    Writes use ``index_copy_`` along dim 0, and the call ends with
    ``torch.hpu.synchronize()``.
    """
    # WA: https://github.com/pytorch/pytorch/issues/169656
    # float8 sources are indexed through a uint8 view and viewed back as the
    # original dtype before being written to the destination.
    source_dtype = src_cache.dtype
    needs_uint8_view = source_dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
    if needs_uint8_view:
        src_cache = src_cache.view(torch.uint8)

    def _restore_dtype(tensor):
        # Undo the uint8 view only if one was applied above.
        return tensor.view(source_dtype) if needs_uint8_view else tensor

    if isinstance(dst_cache, tuple):
        gathered = src_cache[:, src_block_indices].to(dst_cache[0].device)
        dst_cache[0].index_copy_(0, dst_block_indices, _restore_dtype(gathered[0]))
        dst_cache[1].index_copy_(0, dst_block_indices, _restore_dtype(gathered[1]))
    else:
        selected = _restore_dtype(src_cache[src_block_indices])
        dst_cache.index_copy_(0, dst_block_indices, selected.to(dst_cache.device))
    torch.hpu.synchronize()

is_async_output_supported `classmethod` ¶

is_async_output_supported(
    enforce_eager: Optional[bool],
) -> bool

Source code in vllm_gaudi/platform.py

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
    """Report async-output support; always True, ``enforce_eager`` is ignored."""
    supported = True
    return supported

is_kv_cache_dtype_supported `classmethod` ¶

is_kv_cache_dtype_supported(
    kv_cache_dtype: str, model_config: ModelConfig
) -> bool

Source code in vllm_gaudi/platform.py

@classmethod
def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, model_config: ModelConfig) -> bool:
    """Return True only when ``kv_cache_dtype`` is exactly ``"fp8_inc"``.

    ``model_config`` is not consulted.
    """
    is_fp8_inc = kv_cache_dtype == "fp8_inc"
    return is_fp8_inc

is_pin_memory_available `classmethod` ¶

is_pin_memory_available()

Source code in vllm_gaudi/platform.py

@classmethod
def is_pin_memory_available(cls):
    """Report that pinned memory is unavailable.

    Logs a warning on every call and returns False.
    """
    logger.warning("Pin memory is not supported on HPU.")
    pin_memory_supported = False
    return pin_memory_supported

is_sleep_mode_available ¶

is_sleep_mode_available() -> bool

Source code in vllm_gaudi/platform.py

def is_sleep_mode_available(cls) -> bool:
    """Report sleep-mode availability; always returns True."""
    # NOTE(review): this definition carries no @classmethod decorator, so when
    # it is called on an instance the parameter named ``cls`` is bound to that
    # instance. Confirm whether the decorator was omitted on purpose or the
    # parameter should be named ``self``.
    available = True
    return available

make_synced_weight_loader `classmethod` ¶

make_synced_weight_loader(original_weight_loader)

Wrap the original weight loader to make it synced.

Source code in vllm_gaudi/platform.py

@classmethod
def make_synced_weight_loader(cls, original_weight_loader):
    """
    Wrap the original weight loader to make it synced.

    The returned callable forwards ``param`` and all further positional and
    keyword arguments to ``original_weight_loader``, then calls
    ``torch.hpu.synchronize()`` and returns the loader's result.
    """

    def _synced_weight_loader(param, *args, **kwargs):
        # Run the wrapped loader first; synchronize only after it returns.
        loaded = original_weight_loader(param, *args, **kwargs)
        torch.hpu.synchronize()
        return loaded

    return _synced_weight_loader

manual_seed_all `classmethod` ¶

manual_seed_all(seed: int) -> None

Source code in vllm_gaudi/platform.py

@classmethod
def manual_seed_all(cls, seed: int) -> None:
    """Forward ``seed`` to ``torch.hpu.random.manual_seed_all``."""
    hpu_random = torch.hpu.random
    hpu_random.manual_seed_all(seed)

patch_for_pt27 `classmethod` ¶

patch_for_pt27() -> None

Source code in vllm_gaudi/platform.py

@classmethod
def patch_for_pt27(cls) -> None:
    """Install a ``__torch_function__`` override on ``BasevLLMParameter`` for torch < 2.8.

    Does nothing when torch is 2.8.0 or newer. Otherwise the override
    forwards every call to the ``__torch_function__`` of the next class in
    ``BasevLLMParameter``'s MRO, replacing ``kwargs=None`` with ``{}``. If
    that class defines no ``__torch_function__``, the override returns
    ``NotImplemented``.
    """
    from vllm.utils.torch_utils import is_torch_equal_or_newer
    if is_torch_equal_or_newer("2.8.0"):
        return

    from vllm.model_executor import BasevLLMParameter
    # The immediate parent in the MRO is the class whose handler we defer to.
    parent_class = BasevLLMParameter.__mro__[1]
    parent_torch_function = getattr(parent_class, "__torch_function__", None)

    def torch_function(origin_cls, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        if parent_torch_function is None:
            return NotImplemented
        return parent_torch_function(func, types, args, kwargs)

    # The override takes the owning class as its first parameter, so it has to
    # be installed as a classmethod. With staticmethod nothing is bound to
    # ``origin_cls``: a (func, types, args, kwargs) call would shift every
    # argument by one position and the parent would receive the wrong values.
    BasevLLMParameter.__torch_function__ = classmethod(torch_function)  # type: ignore[assignment]

set_device `classmethod` ¶

set_device(device: device) -> None

Set the device for the current platform.

Source code in vllm_gaudi/platform.py

@classmethod
def set_device(cls, device: torch.device) -> None:
    """
    Set the device for the current platform.

    This implementation is a no-op: ``device`` is ignored and None is returned.
    """
    return None

set_torch_compile `classmethod` ¶

set_torch_compile() -> None

Source code in vllm_gaudi/platform.py

@classmethod
def set_torch_compile(cls) -> None:
    """Configure process environment variables for lazy or eager (torch.compile) mode.

    Always sets ``PT_HPU_WEIGHT_SHARING`` to ``'0'`` when it is not already
    set. The mode is read from ``htorch.utils.internal.is_lazy()``.

    In lazy mode, dynamo is disabled, ``PT_HPU_ENABLE_LAZY_COLLECTIVES`` is
    set to ``'true'``, and ``RUNTIME_SCALE_PATCHING`` /
    ``FUSER_ENABLE_MULTI_THREADED_INVOCATIONS`` are removed if their marker
    variables show an earlier call auto-set them.

    Otherwise, each of those two variables is set to ``'1'`` if unset, and
    its marker variable is recorded alongside.
    """
    # NOTE: PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
    # does not support torch.compile
    # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
    # torch.compile support

    # PT_HPU_WEIGHT_SHARING=0 is needed in both lazy and eager modes.
    # Only set if not already provided by the user.
    if os.environ.get('PT_HPU_WEIGHT_SHARING') is None:
        os.environ['PT_HPU_WEIGHT_SHARING'] = '0'
    is_lazy = htorch.utils.internal.is_lazy()
    if is_lazy:
        torch._dynamo.config.disable = True
        # NOTE multi-HPU inference with HPUGraphs (lazy-only)
        # requires enabling lazy collectives
        # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html  # noqa: E501
        os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
        # Remove eager-mode-only env vars that were auto-set by a prior
        # set_torch_compile() call (e.g. in a parent pytest process
        # that loaded vllm_gaudi as a plugin in eager mode).
        # User-explicitly-set values are left untouched.
        # The marker is always popped; the paired variable is removed only
        # when the marker held a non-empty value.
        if os.environ.pop(cls._MARKER_RUNTIME_SCALE_PATCHING, None):
            os.environ.pop('RUNTIME_SCALE_PATCHING', None)
            logger.info("Removed inherited RUNTIME_SCALE_PATCHING "
                        "(auto-set by parent process in eager mode)")
        if os.environ.pop(cls._MARKER_FUSER_MULTI_THREADED, None):
            os.environ.pop('FUSER_ENABLE_MULTI_THREADED_INVOCATIONS', None)
            logger.info("Removed inherited "
                        "FUSER_ENABLE_MULTI_THREADED_INVOCATIONS "
                        "(auto-set by parent process in eager mode)")
    else:
        # If not set by user then for torch compile enable Runtime scale patching by default
        # The marker records that this call, not the user, set the value,
        # so the lazy-mode branch above can undo it later.
        if os.environ.get('RUNTIME_SCALE_PATCHING') is None:
            os.environ['RUNTIME_SCALE_PATCHING'] = '1'
            os.environ[cls._MARKER_RUNTIME_SCALE_PATCHING] = '1'
        #This allows for utilization of Parallel Compilation feature
        if os.environ.get('FUSER_ENABLE_MULTI_THREADED_INVOCATIONS') is None:
            os.environ['FUSER_ENABLE_MULTI_THREADED_INVOCATIONS'] = '1'
            os.environ[cls._MARKER_FUSER_MULTI_THREADED] = '1'

support_hybrid_kv_cache `classmethod` ¶

support_hybrid_kv_cache() -> bool

Source code in vllm_gaudi/platform.py

@classmethod
def support_hybrid_kv_cache(cls) -> bool:
    """Report hybrid KV cache support; always returns True."""
    supported = True
    return supported

supports_structured_output `classmethod` ¶

supports_structured_output() -> bool

Source code in vllm_gaudi/platform.py

@classmethod
def supports_structured_output(cls) -> bool:
    """Report structured-output support; always returns True."""
    supported = True
    return supported

supports_v1 `classmethod` ¶

supports_v1(model_config: ModelConfig) -> bool

Source code in vllm_gaudi/platform.py

@classmethod
def supports_v1(cls, model_config: ModelConfig) -> bool:
    """Report V1 engine support; always True, ``model_config`` is ignored."""
    # V1 support on HPU is experimental
    v1_supported = True
    return v1_supported

swap_out_blocks_to_host `classmethod` ¶

swap_out_blocks_to_host(
    src_cache: Union[tuple[Tensor], Tensor],
    dst_cache: Tensor,
    src_block_indices: Tensor,
    dst_block_indices: Tensor,
) -> None

Copy blocks from HPU to host (CPU).

Source code in vllm_gaudi/platform.py

@classmethod
def swap_out_blocks_to_host(
    cls,
    src_cache: Union[tuple[torch.Tensor], torch.Tensor],
    dst_cache: torch.Tensor,
    src_block_indices: torch.Tensor,
    dst_block_indices: torch.Tensor,
) -> None:
    """Copy blocks from HPU to host (CPU).

    For a single source tensor, the selected blocks are moved to CPU and
    assigned to ``dst_cache[dst_block_indices]``. For a tuple of tensors,
    the selected blocks of each are stacked along a new leading dimension
    and assigned to ``dst_cache[:, dst_block_indices]``.
    """
    if not isinstance(src_cache, tuple):
        dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
        return

    # One gathered slice per tuple member, stacked on a new dim 0.
    gathered = [member[src_block_indices] for member in src_cache]
    stacked = torch.stack(gathered, dim=0)
    dst_cache[:, dst_block_indices] = stacked.cpu()

update_block_size_for_backend `classmethod` ¶

update_block_size_for_backend(
    vllm_config: VllmConfig,
) -> None

Source code in vllm_gaudi/platform.py

@classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
    """Adjust ``cache_config.block_size`` before delegating to the parent implementation.

    For every model except ``granitemoehybrid`` this only calls
    ``super().update_block_size_for_backend``. For ``granitemoehybrid`` it
    first derives a block size from the mamba page size and the attention
    page size per token, writes it to ``cache_config.block_size`` (and to
    ``mamba_block_size`` when ``mamba_cache_mode == "align"``), and then
    calls the parent with ``user_specified_block_size`` forced to True.
    """

    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config

    # For Granite 4.0-H (granitemoehybrid), we compute the correct
    # block_size in this method using the PC-aware alignment formula
    # (528 without prefix caching, 768 with prefix caching).
    # We set block_size before calling super and mark it as
    # user-specified so Phase 1 preserves it; Phase 2
    # (_align_hybrid_block_size) then validates and sets
    # mamba_page_size_padded.
    is_granite_hybrid = (model_config is not None
                         and getattr(model_config.hf_config, "model_type", None) == "granitemoehybrid")
    if is_granite_hybrid:
        # Compute the correct block_size using the PC-aware formula.
        from vllm.utils.math_utils import cdiv
        from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
        from vllm.model_executor.models import ModelRegistry
        # "auto" means the KV cache uses the model dtype; any other value is
        # looked up in the string-to-torch-dtype table.
        if cache_config.cache_dtype == "auto":
            kv_dtype = model_config.dtype
        else:
            from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
            kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
        # Page size in bytes of an attention spec with block_size=1
        # (presumably the per-token attention KV footprint; confirm against
        # FullAttentionSpec.page_size_bytes).
        attn_1tok = FullAttentionSpec(
            block_size=1,
            num_kv_heads=model_config.get_num_kv_heads(vllm_config.parallel_config),
            head_size=model_config.get_head_size(),
            dtype=kv_dtype,
        ).page_size_bytes
        model_cls, _ = ModelRegistry.resolve_model_cls(
            model_config.architecture,
            model_config=model_config,
        )
        mamba_page_size = MambaSpec(
            shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
            dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
            block_size=-1,
        ).page_size_bytes
        # The block size is only recomputed when the mamba page size is positive.
        if mamba_page_size > 0:
            # With prefix caching the alignment is the model's mamba chunk size
            # (hf_config.mamba_d_chunk, default 256); without it, 16.
            if cache_config.enable_prefix_caching:
                mamba_chunk_size = getattr(model_config.hf_config, 'mamba_d_chunk', 256)
                alignment = mamba_chunk_size
            else:
                alignment = 16
            # Result is a multiple of `alignment`
            # (assumes cdiv is ceiling division; confirm in vllm.utils.math_utils).
            attn_block_size = alignment * cdiv(mamba_page_size, alignment * attn_1tok)
            cache_config.block_size = attn_block_size
            if cache_config.mamba_cache_mode == "align":
                cache_config.mamba_block_size = attn_block_size
            logger.info(
                "Setting granitemoehybrid block_size to %d tokens "
                "(alignment=%d, mamba_page_size=%d bytes, "
                "prefix_caching=%s).",
                attn_block_size,
                alignment,
                mamba_page_size,
                cache_config.enable_prefix_caching,
            )
        # Force the user-specified flag on for the duration of the parent
        # call, then restore it to False if it was not set originally.
        # NOTE(review): this also runs when mamba_page_size <= 0, where no
        # block size was computed above. The flag is not restored if the
        # parent call raises.
        if not cache_config.user_specified_block_size:
            cache_config.user_specified_block_size = True
            super().update_block_size_for_backend(vllm_config)
            cache_config.user_specified_block_size = False
        else:
            super().update_block_size_for_backend(vllm_config)
    else:
        super().update_block_size_for_backend(vllm_config)

use_sync_weight_loader `classmethod` ¶

use_sync_weight_loader() -> bool

Return whether the current platform needs to use a synced weight loader.

Source code in vllm_gaudi/platform.py

@classmethod
def use_sync_weight_loader(cls) -> bool:
    """
    Return whether the current platform needs a synced weight loader.

    Controlled by ``VLLM_WEIGHT_LOAD_FORCE_SYNC``: True when the value is
    ``true`` or ``1`` (case-insensitive). An unset variable defaults to
    ``"true"``.
    """
    setting = os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", "true")
    return setting.lower() in ("true", "1")

is_qwen3_5_hybrid_model ¶

is_qwen3_5_hybrid_model(
    model_config: Optional[ModelConfig],
) -> bool

Source code in vllm_gaudi/platform.py

def is_qwen3_5_hybrid_model(model_config: Optional[ModelConfig]) -> bool:
    """Return True if ``model_config`` describes a hybrid Qwen3.5 architecture.

    False is returned when ``model_config`` is None or its ``is_hybrid`` is
    falsy. Otherwise the names in ``hf_config.architectures`` (if any) plus
    ``model_config.architecture`` (if not None) are checked against
    ``QWEN3_5_HYBRID_ARCHS``.
    """
    if model_config is None:
        return False
    if not model_config.is_hybrid:
        return False

    # Both attributes are optional; a missing or empty value adds no names.
    hf_config = getattr(model_config, "hf_config", None)
    candidates = set(getattr(hf_config, "architectures", []) or [])
    resolved = getattr(model_config, "architecture", None)
    if resolved is not None:
        candidates.add(resolved)

    return not candidates.isdisjoint(QWEN3_5_HYBRID_ARCHS)

retain_envs ¶

retain_envs(var_name)

Source code in vllm_gaudi/platform.py

def retain_envs(var_name):
    """Return True if the environment variable ``var_name`` should be retained.

    A name qualifies when it contains ``HPU``, ``RAY`` or ``VLLM`` as a
    substring, or is one of the three ``*_SOCKET_IFNAME`` variables below.
    """
    substrings = ('HPU', 'RAY', 'VLLM')
    exact_names = ['GLOO_SOCKET_IFNAME', 'HCCL_SOCKET_IFNAME', 'NCCL_SOCKET_IFNAME']
    if any(fragment in var_name for fragment in substrings):
        return True
    return var_name in exact_names

vllm_gaudi.platform ¶

QWEN3_5_HYBRID_ARCHS module-attribute ¶

logger module-attribute ¶

HpuPlatform ¶

_MARKER_FUSER_MULTI_THREADED class-attribute instance-attribute ¶

_MARKER_RUNTIME_SCALE_PATCHING class-attribute instance-attribute ¶

_enum class-attribute instance-attribute ¶

additional_env_vars class-attribute instance-attribute ¶

device_control_env_var class-attribute instance-attribute ¶

device_name class-attribute instance-attribute ¶

device_type class-attribute instance-attribute ¶

dispatch_key class-attribute instance-attribute ¶

ray_device_key class-attribute instance-attribute ¶

simple_compile_backend class-attribute instance-attribute ¶

supported_quantization class-attribute instance-attribute ¶

adjust_cuda_hooks classmethod ¶

check_and_update_config classmethod ¶

get_attn_backend_cls classmethod ¶

get_device_communicator_cls classmethod ¶

get_device_name classmethod ¶

get_device_total_memory classmethod ¶

get_nixl_memory_type classmethod ¶

get_nixl_supported_devices classmethod ¶

get_punica_wrapper classmethod ¶

insert_blocks_to_device classmethod ¶

is_async_output_supported classmethod ¶

is_kv_cache_dtype_supported classmethod ¶

is_pin_memory_available classmethod ¶

is_sleep_mode_available ¶

make_synced_weight_loader classmethod ¶

manual_seed_all classmethod ¶

patch_for_pt27 classmethod ¶

set_device classmethod ¶

set_torch_compile classmethod ¶

support_hybrid_kv_cache classmethod ¶

supports_structured_output classmethod ¶

supports_v1 classmethod ¶

swap_out_blocks_to_host classmethod ¶

update_block_size_for_backend classmethod ¶

use_sync_weight_loader classmethod ¶

is_qwen3_5_hybrid_model ¶

retain_envs ¶

QWEN3_5_HYBRID_ARCHS `module-attribute` ¶

logger `module-attribute` ¶

_MARKER_FUSER_MULTI_THREADED `class-attribute` `instance-attribute` ¶

_MARKER_RUNTIME_SCALE_PATCHING `class-attribute` `instance-attribute` ¶

_enum `class-attribute` `instance-attribute` ¶

additional_env_vars `class-attribute` `instance-attribute` ¶

device_control_env_var `class-attribute` `instance-attribute` ¶

device_name `class-attribute` `instance-attribute` ¶

device_type `class-attribute` `instance-attribute` ¶

dispatch_key `class-attribute` `instance-attribute` ¶

ray_device_key `class-attribute` `instance-attribute` ¶

simple_compile_backend `class-attribute` `instance-attribute` ¶

supported_quantization `class-attribute` `instance-attribute` ¶

adjust_cuda_hooks `classmethod` ¶

check_and_update_config `classmethod` ¶

get_attn_backend_cls `classmethod` ¶

get_device_communicator_cls `classmethod` ¶

get_device_name `classmethod` ¶

get_device_total_memory `classmethod` ¶

get_nixl_memory_type `classmethod` ¶

get_nixl_supported_devices `classmethod` ¶

get_punica_wrapper `classmethod` ¶

insert_blocks_to_device `classmethod` ¶

is_async_output_supported `classmethod` ¶

is_kv_cache_dtype_supported `classmethod` ¶

is_pin_memory_available `classmethod` ¶

make_synced_weight_loader `classmethod` ¶

manual_seed_all `classmethod` ¶

patch_for_pt27 `classmethod` ¶

set_device `classmethod` ¶

set_torch_compile `classmethod` ¶

support_hybrid_kv_cache `classmethod` ¶

supports_structured_output `classmethod` ¶

supports_v1 `classmethod` ¶

swap_out_blocks_to_host `classmethod` ¶

update_block_size_for_backend `classmethod` ¶

use_sync_weight_loader `classmethod` ¶