Skip to content

vllm_gaudi.platform

QWEN3_5_HYBRID_ARCHS module-attribute

QWEN3_5_HYBRID_ARCHS = frozenset(
    {
        "Qwen3_5ForConditionalGeneration",
        "Qwen3_5MoeForConditionalGeneration",
    }
)

logger module-attribute

logger = logger()

HpuPlatform

Bases: Platform

Source code in vllm_gaudi/platform.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
class HpuPlatform(Platform):
    # Static description of the HPU platform, read as class attributes.
    # NOTE(review): OOT presumably means "out-of-tree" plugin platform — confirm
    # against the PlatformEnum definition.
    _enum = PlatformEnum.OOT
    # Device identifiers; get_device_name() returns device_name verbatim.
    device_name: str = "hpu"
    device_type: str = "hpu"
    dispatch_key: str = "HPU"
    ray_device_key: str = "HPU"
    # Name of the environment variable associated with device visibility.
    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
    # Quantization method names this platform declares support for.
    supported_quantization: list[str] = ["compressed-tensors", "fp8", "inc", "awq_hpu", "gptq_hpu", "modelopt"]
    simple_compile_backend = "hpu_backend"
    # Names of the environment variables, present when this class body is
    # executed, for which retain_envs() returns a truthy value.  The list is
    # computed once and is not refreshed if the environment changes later.
    additional_env_vars = [k for k, v in os.environ.items() if retain_envs(k)]

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
        num_heads: Optional[int] = None,
    ) -> str:
        """Return the dotted import path of the attention backend to use.

        The decision is based on the current vLLM config's device type and on
        the ``use_sparse`` / ``use_mla`` flags of ``attn_selector_config``.
        ``selected_backend`` and ``num_heads`` are accepted but not consulted.

        Raises:
            NotImplementedError: if sparse attention is requested.
        """
        from vllm.config import get_current_vllm_config
        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        # A CPU-targeted config is served by the CPU attention backend.
        target_device = get_current_vllm_config().device_config.device_type
        if target_device == "cpu":
            logger.info("Using CPU_ATTN backend for CPU-targeted config.")
            return AttentionBackendEnum.CPU_ATTN.get_path()

        if attn_selector_config.use_sparse:
            raise NotImplementedError("Sparse Attention is not supported on HPU.")

        if not attn_selector_config.use_mla:
            logger.info("Using HPUAttentionV1 backend.")
            return "vllm_gaudi.v1.attention.backends.hpu_attn.HPUAttentionBackendV1"

        logger.info("Using HPUAttentionMLA backend.")
        return "vllm_gaudi.attention.backends.hpu_attn.HPUMLAAttentionBackend"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Report async output support: always ``True``, whatever ``enforce_eager`` is."""
        return True

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """Accept a device selection request; intentionally a no-op here.

        ``device`` is ignored and nothing is returned.
        """
        return None

    @classmethod
    def manual_seed_all(cls, seed: int) -> None:
        """Forward ``seed`` to ``torch.hpu.random.manual_seed_all``."""
        hpu_random = torch.hpu.random
        hpu_random.manual_seed_all(seed)

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the class-level ``device_name``; ``device_id`` is not used."""
        name = cls.device_name
        return name

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return the total device memory in bytes -- currently always ``0``.

        A warning is logged on every call because the value is a placeholder;
        ``device_id`` is not used.
        """
        # NOTE: This is a workaround.
        # The proper implementation would be:
        #     total_hpu_memory = torch.hpu.mem_get_info()[1]
        # Zero is returned instead to preserve the current logic in
        # vllm/vllm/engine/arg_utils.py -> get_batch_defaults() ->
        # default_max_num_batched_tokens, which avoids the error in
        # hpu_perf_test while also preventing a NotImplementedError in
        # test_defaults_with_usage_context.
        logger.warning("This is a workaround! Please check the NOTE "
                       "in the get_device_total_memory definition.")
        return 0

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Rewrite ``vllm_config`` in place with HPU-specific settings.

        In order, this method:

        * selects the HPU worker class when ``worker_cls`` is ``"auto"``;
        * sets the KV-cache block size to 128 unless the user specified one
          (Qwen3.5 hybrid models are reset to 128 even then);
        * for hybrid models, rounds ``mamba_page_size_padded`` up to a
          multiple of the attention page size;
        * switches the worker multiprocessing method from ``fork`` to
          ``spawn`` unless ``fork`` was explicitly requested;
        * replaces float16/float32 model dtypes with bfloat16;
        * enables all custom ops, disables CUDA graphs and forces
          ``CompilationMode.NONE``;
        * disables prefix caching under Contiguous PA and downgrades
          ``mamba_cache_mode`` from ``"all"`` to ``"align"`` when prefix
          caching is on;
        * loads weights to CPU first when INC quantization is detected;
        * turns async scheduling off when a speculative config is present.

        Besides mutating ``vllm_config``, it may write
        ``VLLM_WORKER_MULTIPROC_METHOD`` and always writes
        ``VLLM_DISABLE_SHARED_EXPERTS_STREAM`` in ``os.environ``.
        """
        parallel_config = vllm_config.parallel_config

        # Only fill in the HPU worker when the caller left the choice open.
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                    "vllm_gaudi.v1.worker.hpu_worker.HPUWorker"

        # NOTE(kzawora): default block size for Gaudi should be 128
        # smaller sizes still work, but very inefficiently
        cache_config = vllm_config.cache_config
        if not cache_config.user_specified_block_size:
            cache_config.block_size = 128
        elif is_qwen3_5_hybrid_model(vllm_config.model_config) and cache_config.block_size != 128:
            # Narrow the reset to Qwen3.5 hybrids. Other hybrid models may
            # legitimately use a larger KV-manager block size and rely on
            # virtual block splitting down to 128-token HPU kernels.
            logger.info(
                "Resetting Qwen3.5 hybrid block_size from %d to 128 "
                "before Gaudi hybrid page-size realignment.",
                cache_config.block_size,
            )
            cache_config.block_size = 128
            if cache_config.mamba_cache_mode == "align":
                cache_config.mamba_block_size = 128
        # Hybrid GDN/Mamba models: upstream HybridAttentionMambaModelConfig
        # already ran and computed block_size / mamba_page_size_padded for
        # GPU.  HPU overrode block_size to 128 above, so we must re-align
        # mamba_page_size_padded to be a multiple of the HPU attention page
        # size (block_size * per-token KV bytes).  Without this the upstream
        # unify_kv_cache_spec_page_size() fails because the two page sizes
        # are not divisible.
        if (cache_config and cache_config.block_size is not None and vllm_config.model_config is not None
                and vllm_config.model_config.is_hybrid and cache_config.mamba_page_size_padded is not None):
            # Recompute mamba_page_size_padded so it is a multiple of
            # the HPU attention page size.
            from vllm.utils.torch_utils import get_dtype_size
            from math import ceil
            model_config = vllm_config.model_config
            # NOTE(review): with cache_dtype == "auto" this reads
            # model_config.dtype before the float16/float32 -> bfloat16
            # override further down in this method, so a float32 model is
            # sized with 4-byte elements here.  Confirm this ordering is
            # intended.
            if cache_config.cache_dtype == "auto":
                kv_dtype = model_config.dtype
            else:
                from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
                kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
            num_kv_heads = model_config.get_num_kv_heads(parallel_config)
            head_size = model_config.get_head_size()
            # Bytes in one attention page; the factor 2 presumably covers the
            # key and value tensors.
            attn_page = (2 * cache_config.block_size * num_kv_heads * head_size * get_dtype_size(kv_dtype))
            if attn_page > 0 and cache_config.mamba_page_size_padded % attn_page != 0:
                old_padded = cache_config.mamba_page_size_padded
                # Round up to the next multiple of attn_page.
                cache_config.mamba_page_size_padded = (ceil(old_padded / attn_page) * attn_page)
                logger.info(
                    "Rescaled mamba_page_size_padded from %d to %d "
                    "to align with HPU attention page size %d "
                    "(block_size=%d).",
                    old_padded,
                    cache_config.mamba_page_size_padded,
                    attn_page,
                    cache_config.block_size,
                )
        # 'fork' is only kept when the environment variable is set explicitly;
        # otherwise the process environment is switched to 'spawn'.
        if (parallel_config.distributed_executor_backend in ['mp', 'uni']
                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Using "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                               "as it was explicitly requested.")
            else:
                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Setting "
                               "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                               "To override that behavior, please set "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        # float16 and float32 models are run as bfloat16 instead.
        if (vllm_config.model_config is not None and vllm_config.model_config.dtype in (torch.float16, torch.float32)):
            logger.warning("The HPU backend currently does not support %s. "
                           "Using bfloat16 instead.", vllm_config.model_config.dtype)
            vllm_config.model_config.dtype = torch.bfloat16

        from vllm.config import CompilationMode, CUDAGraphMode
        compilation_config = vllm_config.compilation_config
        # Activate custom ops for v1.
        compilation_config.custom_ops = ["all"]
        # CUDA graph capture is switched off entirely.
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
        compilation_config.cudagraph_capture_sizes = []

        if get_config().VLLM_CONTIGUOUS_PA:
            logger.warning("Using Contiguous PA, disabling prefix caching")
            vllm_config.cache_config.enable_prefix_caching = False

        if (vllm_config.cache_config.enable_prefix_caching and vllm_config.cache_config.mamba_cache_mode == "all"):
            vllm_config.cache_config.mamba_cache_mode = "align"
            logger.info("[HPU] Overriding mamba_cache_mode from 'all' to 'align' "
                        "to ensure block-aligned chunked prefill splits.")

        # Debug-only summary of the cache settings chosen for hybrid models.
        if (vllm_config.model_config is not None and vllm_config.model_config.is_hybrid):
            logger.debug(
                "[HPU] Hybrid model cache config: block_size=%s, "
                "mamba_block_size=%s, mamba_cache_mode=%s, "
                "enable_prefix_caching=%s", cache_config.block_size, getattr(cache_config, "mamba_block_size", None),
                getattr(cache_config, "mamba_cache_mode", None), cache_config.enable_prefix_caching)

        if compilation_config.mode != CompilationMode.NONE:
            logger.info("[HPU] Forcing CompilationMode.NONE "
                        "compilation mode")
            compilation_config.mode = CompilationMode.NONE

        # Force CPU loading for INC quantization to prevent OOM during weight loading.
        # INC FP8 quantization requires weights to be loaded to CPU first, then
        # quantized and moved to device. Without this, weights are loaded directly
        # to HPU in BF16 which causes OOM for large models.
        model_config = vllm_config.model_config
        # INC is detected either from the model config or from a non-empty
        # QUANT_CONFIG environment variable.
        is_inc_quant = (model_config is not None and model_config.quantization == "inc") or os.getenv("QUANT_CONFIG")
        if is_inc_quant and vllm_config.load_config is not None and vllm_config.load_config.device is None:
            logger.info("[HPU] INC quantization detected, loading weights to CPU first")
            vllm_config.load_config.device = "cpu"

        # Disable multi-stream for shared experts as no Stream on CPU
        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"

        # NOTE: vLLM enables async scheduling by default, including when
        # speculative decoding is on.  On HPU, speculative decoding is not
        # supported together with async scheduling, so async scheduling is
        # kept only when no speculative config is present.
        vllm_config.scheduler_config.async_scheduling = \
            vllm_config.scheduler_config.async_scheduling and vllm_config.speculative_config is None

    @classmethod
    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
        """Pick the KV-cache block size, with special handling for Granite 4.0-H.

        For every model except ``granitemoehybrid`` this simply defers to the
        parent implementation.  For ``granitemoehybrid`` the block size is
        computed here first and then protected from being overwritten while
        the parent implementation runs.

        Args:
            vllm_config: configuration whose ``cache_config`` is updated in
                place.
        """
        cache_config = vllm_config.cache_config
        model_config = vllm_config.model_config

        is_granite_hybrid = (model_config is not None
                             and getattr(model_config.hf_config, "model_type", None) == "granitemoehybrid")
        if not is_granite_hybrid:
            super().update_block_size_for_backend(vllm_config)
            return

        # For Granite 4.0-H (granitemoehybrid), we compute the correct
        # block_size in this method using the PC-aware alignment formula
        # (528 without prefix caching, 768 with prefix caching).
        # We set block_size before calling super and mark it as
        # user-specified so Phase 1 preserves it; Phase 2
        # (_align_hybrid_block_size) then validates and sets
        # mamba_page_size_padded.
        from vllm.utils.math_utils import cdiv
        from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
        from vllm.model_executor.models import ModelRegistry
        if cache_config.cache_dtype == "auto":
            kv_dtype = model_config.dtype
        else:
            from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
            kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
        # Attention page size in bytes for a single-token block.
        attn_1tok = FullAttentionSpec(
            block_size=1,
            num_kv_heads=model_config.get_num_kv_heads(vllm_config.parallel_config),
            head_size=model_config.get_head_size(),
            dtype=kv_dtype,
        ).page_size_bytes
        model_cls, _ = ModelRegistry.resolve_model_cls(
            model_config.architecture,
            model_config=model_config,
        )
        mamba_page_size = MambaSpec(
            shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
            dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
            block_size=-1,
        ).page_size_bytes
        if mamba_page_size > 0:
            # With prefix caching the block size is aligned to the mamba chunk
            # size (default 256); otherwise to 16 tokens.
            if cache_config.enable_prefix_caching:
                alignment = getattr(model_config.hf_config, 'mamba_d_chunk', 256)
            else:
                alignment = 16
            # Smallest multiple of `alignment` tokens whose attention page is
            # at least as large as the mamba page.
            attn_block_size = alignment * cdiv(mamba_page_size, alignment * attn_1tok)
            cache_config.block_size = attn_block_size
            if cache_config.mamba_cache_mode == "align":
                cache_config.mamba_block_size = attn_block_size
            logger.info(
                "Setting granitemoehybrid block_size to %d tokens "
                "(alignment=%d, mamba_page_size=%d bytes, "
                "prefix_caching=%s).",
                attn_block_size,
                alignment,
                mamba_page_size,
                cache_config.enable_prefix_caching,
            )

        if cache_config.user_specified_block_size:
            super().update_block_size_for_backend(vllm_config)
            return

        # Temporarily pretend the block size was user-specified so the parent
        # keeps the value chosen above.  The flag is restored in `finally` so
        # an exception in the parent cannot leave it stuck at True.
        cache_config.user_specified_block_size = True
        try:
            super().update_block_size_for_backend(vllm_config)
        finally:
            cache_config.user_specified_block_size = False

    @classmethod
    def is_pin_memory_available(cls):
        """Return ``False``, logging a warning that pinned memory is unsupported."""
        logger.warning("Pin memory is not supported on HPU.")
        available = False
        return available

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """Return the dotted import path of the HPU Punica (LoRA) wrapper class."""
        wrapper_path = "vllm_gaudi.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
        return wrapper_path

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        """Report hybrid KV-cache support; always ``True``."""
        return True

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Return the dotted import path of the HPU device communicator class."""
        communicator_path = "vllm_gaudi.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa
        return communicator_path

    @classmethod
    def supports_structured_output(cls) -> bool:
        """Report structured-output support; always ``True``."""
        return True

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        """Report V1 engine support; always ``True``, regardless of ``model_config``.

        V1 support on HPU is experimental.
        """
        return True

    @classmethod
    def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
        """Return the NIXL device map: key ``"hpu"`` mapped to ``("cpu", "hpu")``."""
        supported: dict[str, tuple[str, ...]] = {}
        supported["hpu"] = ("cpu", "hpu")
        return supported

    @classmethod
    def get_nixl_memory_type(cls) -> str:
        """Return ``"VRAM"`` or ``"DRAM"`` depending on ``VLLM_NIXL_DEVICE_TO_DEVICE``.

        The variable is read case-insensitively; ``"1"`` and ``"true"`` select
        ``"VRAM"``, anything else (including unset) selects ``"DRAM"``.
        """
        d2d_flag = os.environ.get("VLLM_NIXL_DEVICE_TO_DEVICE", "0").lower()
        return "VRAM" if d2d_flag in ("1", "true") else "DRAM"

    @classmethod
    def is_sleep_mode_available(cls) -> bool:
        """Report that sleep mode is available on HPU; always ``True``.

        Declared as a classmethod, matching its ``cls`` parameter and the
        other capability queries on this class, so it can be called on the
        class itself as well as on an instance.
        """
        return True

    # Names of marker environment variables.  set_torch_compile() sets a
    # marker (in eager mode) whenever it auto-sets the matching variable, so
    # that its lazy-mode branch can remove the auto-set value if it leaked
    # into a subprocess (e.g. via pytest plugin loading vllm_gaudi).
    _MARKER_RUNTIME_SCALE_PATCHING = '_VLLM_AUTOSET_RUNTIME_SCALE_PATCHING'
    _MARKER_FUSER_MULTI_THREADED = '_VLLM_AUTOSET_FUSER_MULTI_THREADED'

    @classmethod
    def set_torch_compile(cls) -> None:
        """Prepare the process environment for lazy or eager HPU execution.

        In lazy mode dynamo is disabled, lazy collectives are enabled and any
        eager-only variables that an earlier call auto-set are removed.  In
        eager mode the eager-only variables are given defaults, and each
        auto-set default is recorded with a marker variable.  Values supplied
        by the user are never overwritten.
        """
        # NOTE: PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) does not support
        # torch.compile; the eager backend (PT_HPU_LAZY_MODE = 0) must be
        # selected for torch.compile support.

        # PT_HPU_WEIGHT_SHARING=0 is needed in both lazy and eager modes.
        # Only set if not already provided by the user.
        os.environ.setdefault('PT_HPU_WEIGHT_SHARING', '0')

        if not htorch.utils.internal.is_lazy():
            # Eager mode: unless the user set them, enable Runtime scale
            # patching (for torch compile) and multi-threaded fuser
            # invocations (Parallel Compilation), remembering what was auto-set.
            eager_defaults = (
                ('RUNTIME_SCALE_PATCHING', cls._MARKER_RUNTIME_SCALE_PATCHING),
                ('FUSER_ENABLE_MULTI_THREADED_INVOCATIONS', cls._MARKER_FUSER_MULTI_THREADED),
            )
            for env_name, marker_name in eager_defaults:
                if env_name not in os.environ:
                    os.environ[env_name] = '1'
                    os.environ[marker_name] = '1'
            return

        torch._dynamo.config.disable = True
        # NOTE multi-HPU inference with HPUGraphs (lazy-only)
        # requires enabling lazy collectives
        # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html  # noqa: E501
        os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'

        # Remove eager-mode-only env vars that were auto-set by a prior
        # set_torch_compile() call (e.g. in a parent pytest process
        # that loaded vllm_gaudi as a plugin in eager mode).
        # User-explicitly-set values are left untouched.
        scale_marker = os.environ.pop(cls._MARKER_RUNTIME_SCALE_PATCHING, None)
        if scale_marker:
            os.environ.pop('RUNTIME_SCALE_PATCHING', None)
            logger.info("Removed inherited RUNTIME_SCALE_PATCHING "
                        "(auto-set by parent process in eager mode)")
        fuser_marker = os.environ.pop(cls._MARKER_FUSER_MULTI_THREADED, None)
        if fuser_marker:
            os.environ.pop('FUSER_ENABLE_MULTI_THREADED_INVOCATIONS', None)
            logger.info("Removed inherited "
                        "FUSER_ENABLE_MULTI_THREADED_INVOCATIONS "
                        "(auto-set by parent process in eager mode)")

    @classmethod
    def adjust_cuda_hooks(cls) -> None:
        """Monkey-patch torch so CUDA reports unavailable and HPU reuses CUDA's property query.

        Both patches are process-wide assignments on the ``torch`` module.
        """
        torch.cuda.is_available = lambda: False
        # The hpu.get_device_properties implementation is considered
        # unreliable while the cuda one is correct, so the cuda function is
        # installed in its place.
        cuda_get_props = torch.cuda.get_device_properties
        torch.hpu.get_device_properties = cuda_get_props

    @classmethod
    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, model_config: ModelConfig) -> bool:
        """Return whether ``kv_cache_dtype`` equals ``"fp8_inc"``; ``model_config`` is unused."""
        is_supported = kv_cache_dtype == "fp8_inc"
        return is_supported

    @classmethod
    def use_sync_weight_loader(cls) -> bool:
        """Return whether weight loading should be synchronized.

        Controlled by ``VLLM_WEIGHT_LOAD_FORCE_SYNC`` (case-insensitive);
        ``"true"`` or ``"1"`` enables it, and it is enabled when unset.
        """
        setting = os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", "true")
        return setting.lower() in ("true", "1")

    @classmethod
    def make_synced_weight_loader(cls, original_weight_loader):
        """Wrap ``original_weight_loader`` so each call ends with an HPU synchronize.

        The returned callable forwards all arguments unchanged and returns
        whatever the wrapped loader returned.
        """

        def _synced_weight_loader(param, *args, **kwargs):
            result = original_weight_loader(param, *args, **kwargs)
            # Synchronize with the device before handing the result back.
            torch.hpu.synchronize()
            return result

        return _synced_weight_loader

    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: Union[tuple[torch.Tensor], torch.Tensor],
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy the selected blocks of ``src_cache`` into ``dst_cache`` on HPU.

        ``dst_cache`` is either a single tensor or a tuple whose first two
        entries are filled from ``src_cache[0]`` and ``src_cache[1]``.
        The call ends with ``torch.hpu.synchronize()``.
        """
        # WA: https://github.com/pytorch/pytorch/issues/169656
        # fp8 sources are reinterpreted as uint8 for the indexing step and
        # viewed back as their original dtype before being written.
        src_dtype = src_cache.dtype
        is_fp8 = src_dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
        if is_fp8:
            src_cache = src_cache.view(torch.uint8)

        if isinstance(dst_cache, tuple):
            # Select along dim 1, move to the destination device, then write
            # each half into its matching destination tensor.
            gathered = src_cache[:, src_block_indices].to(dst_cache[0].device)
            for half in (0, 1):
                payload = gathered[half].view(src_dtype) if is_fp8 else gathered[half]
                dst_cache[half].index_copy_(0, dst_block_indices, payload)
        else:
            gathered = src_cache[src_block_indices]
            if is_fp8:
                gathered = gathered.view(src_dtype)
            dst_cache.index_copy_(0, dst_block_indices, gathered.to(dst_cache.device))
        torch.hpu.synchronize()

    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: Union[tuple[torch.Tensor], torch.Tensor],
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy the selected blocks from HPU into the host (CPU) ``dst_cache``.

        A tuple ``src_cache`` is stacked along a new leading dimension before
        being written to ``dst_cache[:, dst_block_indices]``.
        """
        if not isinstance(src_cache, tuple):
            dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
            return

        selected = [part[src_block_indices] for part in src_cache]
        stacked = torch.stack(selected, dim=0)
        dst_cache[:, dst_block_indices] = stacked.cpu()

    @classmethod
    def patch_for_pt27(cls) -> None:
        """Replace ``BasevLLMParameter.__torch_function__`` on torch older than 2.8.0.

        Does nothing on torch 2.8.0 or newer.  Otherwise the replacement
        delegates to the ``__torch_function__`` of the next class in
        ``BasevLLMParameter``'s MRO, or returns ``NotImplemented`` when that
        class has none.
        """
        from vllm.utils.torch_utils import is_torch_equal_or_newer
        if is_torch_equal_or_newer("2.8.0"):
            return

        from vllm.model_executor import BasevLLMParameter
        base_cls = BasevLLMParameter.__mro__[1]
        base_impl = getattr(base_cls, "__torch_function__", None)

        # NOTE(review): the wrapper is installed as a staticmethod yet takes a
        # leading ``origin_cls`` parameter -- confirm the argument alignment
        # matches how torch invokes ``__torch_function__``.
        def torch_function(origin_cls, func, types, args=(), kwargs=None):
            if base_impl is None:
                return NotImplemented
            return base_impl(func, types, args, {} if kwargs is None else kwargs)

        BasevLLMParameter.__torch_function__ = staticmethod(torch_function)  # type: ignore[assignment]

_MARKER_FUSER_MULTI_THREADED class-attribute instance-attribute

_MARKER_FUSER_MULTI_THREADED = (
    "_VLLM_AUTOSET_FUSER_MULTI_THREADED"
)

_MARKER_RUNTIME_SCALE_PATCHING class-attribute instance-attribute

_MARKER_RUNTIME_SCALE_PATCHING = (
    "_VLLM_AUTOSET_RUNTIME_SCALE_PATCHING"
)

_enum class-attribute instance-attribute

_enum = OOT

additional_env_vars class-attribute instance-attribute

additional_env_vars = [
    k for k, v in (items()) if retain_envs(k)
]

device_control_env_var class-attribute instance-attribute

device_control_env_var: str = 'HABANA_VISIBLE_MODULES'

device_name class-attribute instance-attribute

device_name: str = 'hpu'

device_type class-attribute instance-attribute

device_type: str = 'hpu'

dispatch_key class-attribute instance-attribute

dispatch_key: str = 'HPU'

ray_device_key class-attribute instance-attribute

ray_device_key: str = 'HPU'

simple_compile_backend class-attribute instance-attribute

simple_compile_backend = 'hpu_backend'

supported_quantization class-attribute instance-attribute

supported_quantization: list[str] = [
    "compressed-tensors",
    "fp8",
    "inc",
    "awq_hpu",
    "gptq_hpu",
    "modelopt",
]

adjust_cuda_hooks classmethod

adjust_cuda_hooks() -> None
Source code in vllm_gaudi/platform.py
@classmethod
def adjust_cuda_hooks(cls) -> None:
    """Monkey-patch torch so CUDA reports unavailable and HPU reuses CUDA's property query.

    Both patches are process-wide assignments on the ``torch`` module.
    """
    torch.cuda.is_available = lambda: False
    # The hpu.get_device_properties implementation is considered unreliable
    # while the cuda one is correct, so the cuda function is installed in
    # its place.
    cuda_get_props = torch.cuda.get_device_properties
    torch.hpu.get_device_properties = cuda_get_props

check_and_update_config classmethod

check_and_update_config(vllm_config: VllmConfig) -> None
Source code in vllm_gaudi/platform.py
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
    """Rewrite ``vllm_config`` in place with HPU-specific settings.

    Covers, in order: worker class selection, KV-cache block size,
    hybrid-model page-size alignment, the worker multiprocessing method,
    model dtype, compilation / CUDA graph settings, prefix caching and
    mamba cache mode, the weight-loading device for INC quantization, and
    async scheduling.  It may also write ``VLLM_WORKER_MULTIPROC_METHOD``
    and always writes ``VLLM_DISABLE_SHARED_EXPERTS_STREAM`` in
    ``os.environ``.
    """
    parallel_config = vllm_config.parallel_config

    # Only fill in the HPU worker when the caller left the choice open.
    if parallel_config.worker_cls == "auto":
        parallel_config.worker_cls = \
                "vllm_gaudi.v1.worker.hpu_worker.HPUWorker"

    # NOTE(kzawora): default block size for Gaudi should be 128
    # smaller sizes still work, but very inefficiently
    cache_config = vllm_config.cache_config
    if not cache_config.user_specified_block_size:
        cache_config.block_size = 128
    elif is_qwen3_5_hybrid_model(vllm_config.model_config) and cache_config.block_size != 128:
        # Narrow the reset to Qwen3.5 hybrids. Other hybrid models may
        # legitimately use a larger KV-manager block size and rely on
        # virtual block splitting down to 128-token HPU kernels.
        logger.info(
            "Resetting Qwen3.5 hybrid block_size from %d to 128 "
            "before Gaudi hybrid page-size realignment.",
            cache_config.block_size,
        )
        cache_config.block_size = 128
        if cache_config.mamba_cache_mode == "align":
            cache_config.mamba_block_size = 128
    # Hybrid GDN/Mamba models: upstream HybridAttentionMambaModelConfig
    # already ran and computed block_size / mamba_page_size_padded for
    # GPU.  HPU overrode block_size to 128 above, so we must re-align
    # mamba_page_size_padded to be a multiple of the HPU attention page
    # size (block_size * per-token KV bytes).  Without this the upstream
    # unify_kv_cache_spec_page_size() fails because the two page sizes
    # are not divisible.
    if (cache_config and cache_config.block_size is not None and vllm_config.model_config is not None
            and vllm_config.model_config.is_hybrid and cache_config.mamba_page_size_padded is not None):
        # Recompute mamba_page_size_padded so it is a multiple of
        # the HPU attention page size.
        from vllm.utils.torch_utils import get_dtype_size
        from math import ceil
        model_config = vllm_config.model_config
        # NOTE(review): with cache_dtype == "auto" this reads
        # model_config.dtype before the float16/float32 -> bfloat16 override
        # further down in this function.  Confirm this ordering is intended.
        if cache_config.cache_dtype == "auto":
            kv_dtype = model_config.dtype
        else:
            from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
            kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
        num_kv_heads = model_config.get_num_kv_heads(parallel_config)
        head_size = model_config.get_head_size()
        # Bytes in one attention page; the factor 2 presumably covers the key
        # and value tensors.
        attn_page = (2 * cache_config.block_size * num_kv_heads * head_size * get_dtype_size(kv_dtype))
        if attn_page > 0 and cache_config.mamba_page_size_padded % attn_page != 0:
            old_padded = cache_config.mamba_page_size_padded
            # Round up to the next multiple of attn_page.
            cache_config.mamba_page_size_padded = (ceil(old_padded / attn_page) * attn_page)
            logger.info(
                "Rescaled mamba_page_size_padded from %d to %d "
                "to align with HPU attention page size %d "
                "(block_size=%d).",
                old_padded,
                cache_config.mamba_page_size_padded,
                attn_page,
                cache_config.block_size,
            )
    # 'fork' is only kept when the environment variable is set explicitly;
    # otherwise the process environment is switched to 'spawn'.
    if (parallel_config.distributed_executor_backend in ['mp', 'uni']
            and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
        if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                           "might cause application hangs on exit. Using "
                           "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                           "as it was explicitly requested.")
        else:
            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                           "might cause application hangs on exit. Setting "
                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                           "To override that behavior, please set "
                           "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    # float16 and float32 models are run as bfloat16 instead.
    if (vllm_config.model_config is not None and vllm_config.model_config.dtype in (torch.float16, torch.float32)):
        logger.warning("The HPU backend currently does not support %s. "
                       "Using bfloat16 instead.", vllm_config.model_config.dtype)
        vllm_config.model_config.dtype = torch.bfloat16

    from vllm.config import CompilationMode, CUDAGraphMode
    compilation_config = vllm_config.compilation_config
    # Activate custom ops for v1.
    compilation_config.custom_ops = ["all"]
    # CUDA graph capture is switched off entirely.
    compilation_config.cudagraph_mode = CUDAGraphMode.NONE
    compilation_config.cudagraph_capture_sizes = []

    if get_config().VLLM_CONTIGUOUS_PA:
        logger.warning("Using Contiguous PA, disabling prefix caching")
        vllm_config.cache_config.enable_prefix_caching = False

    if (vllm_config.cache_config.enable_prefix_caching and vllm_config.cache_config.mamba_cache_mode == "all"):
        vllm_config.cache_config.mamba_cache_mode = "align"
        logger.info("[HPU] Overriding mamba_cache_mode from 'all' to 'align' "
                    "to ensure block-aligned chunked prefill splits.")

    # Debug-only summary of the cache settings chosen for hybrid models.
    if (vllm_config.model_config is not None and vllm_config.model_config.is_hybrid):
        logger.debug(
            "[HPU] Hybrid model cache config: block_size=%s, "
            "mamba_block_size=%s, mamba_cache_mode=%s, "
            "enable_prefix_caching=%s", cache_config.block_size, getattr(cache_config, "mamba_block_size", None),
            getattr(cache_config, "mamba_cache_mode", None), cache_config.enable_prefix_caching)

    if compilation_config.mode != CompilationMode.NONE:
        logger.info("[HPU] Forcing CompilationMode.NONE "
                    "compilation mode")
        compilation_config.mode = CompilationMode.NONE

    # Force CPU loading for INC quantization to prevent OOM during weight loading.
    # INC FP8 quantization requires weights to be loaded to CPU first, then
    # quantized and moved to device. Without this, weights are loaded directly
    # to HPU in BF16 which causes OOM for large models.
    model_config = vllm_config.model_config
    # INC is detected either from the model config or from a non-empty
    # QUANT_CONFIG environment variable.
    is_inc_quant = (model_config is not None and model_config.quantization == "inc") or os.getenv("QUANT_CONFIG")
    if is_inc_quant and vllm_config.load_config is not None and vllm_config.load_config.device is None:
        logger.info("[HPU] INC quantization detected, loading weights to CPU first")
        vllm_config.load_config.device = "cpu"

    # Disable multi-stream for shared experts as no Stream on CPU
    os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"

    # NOTE: vLLM enables async scheduling by default, including when
    # speculative decoding is on.  On HPU, speculative decoding is not
    # supported together with async scheduling, so async scheduling is kept
    # only when no speculative config is present.
    vllm_config.scheduler_config.async_scheduling = \
        vllm_config.scheduler_config.async_scheduling and vllm_config.speculative_config is None

get_attn_backend_cls classmethod

get_attn_backend_cls(
    selected_backend: AttentionBackendEnum,
    attn_selector_config: AttentionSelectorConfig,
    num_heads: Optional[int] = None,
) -> str
Source code in vllm_gaudi/platform.py
@classmethod
def get_attn_backend_cls(
    cls,
    selected_backend: "AttentionBackendEnum",
    attn_selector_config: "AttentionSelectorConfig",
    num_heads: Optional[int] = None,
) -> str:
    """Return the dotted path of the attention backend class to use.

    The decision depends only on the current vLLM config's device type and
    on ``attn_selector_config``; ``selected_backend`` and ``num_heads`` are
    accepted but not consulted.

    Returns:
        The path reported by ``AttentionBackendEnum.CPU_ATTN`` when the
        current config targets ``"cpu"``, otherwise the path of the HPU MLA
        backend (``use_mla`` set) or of the HPU V1 backend.

    Raises:
        NotImplementedError: if ``attn_selector_config.use_sparse`` is set
            on a non-CPU config.
    """
    from vllm.config import get_current_vllm_config
    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    # A CPU-targeted config short-circuits before any HPU-specific check.
    device_type = get_current_vllm_config().device_config.device_type
    if device_type == "cpu":
        logger.info("Using CPU_ATTN backend for CPU-targeted config.")
        return AttentionBackendEnum.CPU_ATTN.get_path()

    if attn_selector_config.use_sparse:
        raise NotImplementedError("Sparse Attention is not supported on HPU.")

    if not attn_selector_config.use_mla:
        logger.info("Using HPUAttentionV1 backend.")
        return "vllm_gaudi.v1.attention.backends.hpu_attn.HPUAttentionBackendV1"

    logger.info("Using HPUAttentionMLA backend.")
    return "vllm_gaudi.attention.backends.hpu_attn.HPUMLAAttentionBackend"

get_device_communicator_cls classmethod

get_device_communicator_cls() -> str
Source code in vllm_gaudi/platform.py
@classmethod
def get_device_communicator_cls(cls) -> str:
    """Return the dotted path of the HPU device communicator class."""
    return ("vllm_gaudi.distributed.device_communicators."
            "hpu_communicator.HpuCommunicator")

get_device_name classmethod

get_device_name(device_id: int = 0) -> str
Source code in vllm_gaudi/platform.py
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
    """Return the platform's ``device_name`` attribute.

    ``device_id`` is accepted but ignored: the same name is returned for
    every device index.
    """
    name = cls.device_name
    return name

get_device_total_memory classmethod

get_device_total_memory(device_id: int = 0) -> int

Get the total memory of a device in bytes.

Source code in vllm_gaudi/platform.py
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Get the total memory of a device in bytes.

    This is currently a placeholder: it logs a warning and always returns
    0, regardless of ``device_id``.
    """
    # NOTE: This is a workaround.
    # The correct implementation of the method in this place should look as follows:
    # total_hpu_memory = torch.hpu.mem_get_info()[1]
    # A value of 0 is returned to preserve the current logic in
    # vllm/vllm/engine/arg_utils.py → get_batch_defaults() →
    # default_max_num_batched_tokens, in order to avoid the
    # error in hpu_perf_test, while also preventing a
    # NotImplementedError in test_defaults_with_usage_context.
    logger.warning("This is a workaround! Please check the NOTE "
                   "in the get_device_total_memory definition.")
    return 0

get_nixl_memory_type classmethod

get_nixl_memory_type() -> str
Source code in vllm_gaudi/platform.py
@classmethod
def get_nixl_memory_type(cls) -> str:
    """Return the NIXL memory type selected through the environment.

    Returns ``"VRAM"`` when ``VLLM_NIXL_DEVICE_TO_DEVICE`` is set to ``1``
    or ``true`` (case-insensitive), and ``"DRAM"`` otherwise, including
    when the variable is unset.
    """
    flag = os.environ.get("VLLM_NIXL_DEVICE_TO_DEVICE", "0").lower()
    return "VRAM" if flag in ("1", "true") else "DRAM"

get_nixl_supported_devices classmethod

get_nixl_supported_devices() -> dict[str, tuple[str, ...]]
Source code in vllm_gaudi/platform.py
@classmethod
def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
    """Return the NIXL device mapping: key ``"hpu"`` maps to ``("cpu", "hpu")``.

    A fresh dict is built on every call.
    """
    supported: dict[str, tuple[str, ...]] = {}
    supported["hpu"] = ("cpu", "hpu")
    return supported

get_punica_wrapper classmethod

get_punica_wrapper() -> str
Source code in vllm_gaudi/platform.py
@classmethod
def get_punica_wrapper(cls) -> str:
    """Return the dotted path of the HPU Punica (LoRA) wrapper class."""
    wrapper_path = "vllm_gaudi.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
    return wrapper_path

insert_blocks_to_device classmethod

insert_blocks_to_device(
    src_cache: Tensor,
    dst_cache: Union[tuple[Tensor], Tensor],
    src_block_indices: Tensor,
    dst_block_indices: Tensor,
) -> None

Copy blocks from src_cache to dst_cache on HPU.

Source code in vllm_gaudi/platform.py
@classmethod
def insert_blocks_to_device(
    cls,
    src_cache: torch.Tensor,
    dst_cache: Union[tuple[torch.Tensor], torch.Tensor],
    src_block_indices: torch.Tensor,
    dst_block_indices: torch.Tensor,
) -> None:
    """Copy blocks from src_cache to dst_cache on HPU.

    When ``dst_cache`` is a tuple, ``src_cache`` is indexed along its second
    dimension and entries 0 and 1 of the selection are written into
    ``dst_cache[0]`` and ``dst_cache[1]``. Otherwise ``src_cache`` is indexed
    along its first dimension and written into ``dst_cache`` directly. The
    destination is updated in place with ``index_copy_`` along dim 0, and the
    call ends with ``torch.hpu.synchronize()``.
    """
    # WA: https://github.com/pytorch/pytorch/issues/169656
    # float8 sources are reinterpreted as uint8 for the indexing step and
    # viewed back as their original dtype before being written.
    src_dtype = src_cache.dtype
    is_fp8 = src_dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
    if is_fp8:
        src_cache = src_cache.view(torch.uint8)

    if not isinstance(dst_cache, tuple):
        gathered = src_cache[src_block_indices]
        if is_fp8:
            gathered = gathered.view(src_dtype)
        dst_cache.index_copy_(0, dst_block_indices, gathered.to(dst_cache.device))
    else:
        # Select the blocks once and move them to the destination device
        # before splitting them between the two destination tensors.
        gathered = src_cache[:, src_block_indices].to(dst_cache[0].device)
        for part in (0, 1):
            blocks = gathered[part]
            if is_fp8:
                blocks = blocks.view(src_dtype)
            dst_cache[part].index_copy_(0, dst_block_indices, blocks)

    torch.hpu.synchronize()

is_async_output_supported classmethod

is_async_output_supported(
    enforce_eager: Optional[bool],
) -> bool
Source code in vllm_gaudi/platform.py
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
    """Report async output support; always ``True``, ``enforce_eager`` is ignored."""
    supported = True
    return supported

is_kv_cache_dtype_supported classmethod

is_kv_cache_dtype_supported(
    kv_cache_dtype: str, model_config: ModelConfig
) -> bool
Source code in vllm_gaudi/platform.py
@classmethod
def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, model_config: ModelConfig) -> bool:
    """Return ``True`` only when ``kv_cache_dtype`` is exactly ``"fp8_inc"``.

    ``model_config`` is accepted but not consulted.
    """
    supported_dtype = "fp8_inc"
    return kv_cache_dtype == supported_dtype

is_pin_memory_available classmethod

is_pin_memory_available()
Source code in vllm_gaudi/platform.py
@classmethod
def is_pin_memory_available(cls):
    """Report pinned-memory availability: logs a warning and returns ``False``."""
    available = False
    logger.warning("Pin memory is not supported on HPU.")
    return available

is_sleep_mode_available

is_sleep_mode_available() -> bool
Source code in vllm_gaudi/platform.py
def is_sleep_mode_available(cls) -> bool:
    """Report sleep-mode availability; always ``True``."""
    # NOTE(review): no @classmethod decorator is visible on this definition,
    # so `cls` would receive the instance when called as a method — confirm
    # this is intended.
    available = True
    return available

make_synced_weight_loader classmethod

make_synced_weight_loader(original_weight_loader)

Wrap the original weight loader to make it synced.

Source code in vllm_gaudi/platform.py
@classmethod
def make_synced_weight_loader(cls, original_weight_loader):
    """Wrap the original weight loader to make it synced.

    The returned callable forwards all of its arguments to
    ``original_weight_loader``, then calls ``torch.hpu.synchronize()`` and
    returns whatever the original loader returned.
    """

    def _synced_weight_loader(param, *args, **kwargs):
        result = original_weight_loader(param, *args, **kwargs)
        # Synchronize only after the wrapped loader has returned.
        torch.hpu.synchronize()
        return result

    return _synced_weight_loader

manual_seed_all classmethod

manual_seed_all(seed: int) -> None
Source code in vllm_gaudi/platform.py
@classmethod
def manual_seed_all(cls, seed: int) -> None:
    """Forward ``seed`` to ``torch.hpu.random.manual_seed_all``."""
    hpu_random = torch.hpu.random
    hpu_random.manual_seed_all(seed)

patch_for_pt27 classmethod

patch_for_pt27() -> None
Source code in vllm_gaudi/platform.py
@classmethod
def patch_for_pt27(cls) -> None:
    """Replace ``BasevLLMParameter.__torch_function__`` on torch older than 2.8.0.

    Does nothing when ``is_torch_equal_or_newer("2.8.0")`` is true. Otherwise
    installs a static ``__torch_function__`` on ``BasevLLMParameter`` that
    delegates to the ``__torch_function__`` found on the class's first base
    in its MRO, or returns ``NotImplemented`` when that base has none.
    """

    from vllm.utils.torch_utils import is_torch_equal_or_newer
    if is_torch_equal_or_newer("2.8.0"):
        return

    from vllm.model_executor import BasevLLMParameter
    # __mro__[0] is BasevLLMParameter itself, so index 1 is its first base.
    parent_class = BasevLLMParameter.__mro__[1]
    parent_torch_function = getattr(parent_class, "__torch_function__", None)

    def torch_function(origin_cls, func, types, args=(), kwargs=None):
        # `origin_cls` is accepted but not used by the delegation below.
        if kwargs is None:
            kwargs = {}
        if parent_torch_function is None:
            return NotImplemented
        return parent_torch_function(func, types, args, kwargs)

    # NOTE(review): because this is installed as a staticmethod, `origin_cls`
    # is filled by the caller's first positional argument rather than being
    # bound automatically — confirm this matches how torch invokes
    # __torch_function__.
    BasevLLMParameter.__torch_function__ = staticmethod(torch_function)  # type: ignore[assignment]
    return

set_device classmethod

set_device(device: device) -> None

Set the device for the current platform.

Source code in vllm_gaudi/platform.py
@classmethod
def set_device(cls, device: torch.device) -> None:
    """Set the device for the current platform.

    This implementation is a no-op: ``device`` is ignored and nothing is
    changed.
    """
    return None

set_torch_compile classmethod

set_torch_compile() -> None
Source code in vllm_gaudi/platform.py
@classmethod
def set_torch_compile(cls) -> None:
    """Set up environment variables for the active HPU execution mode.

    In both modes ``PT_HPU_WEIGHT_SHARING`` defaults to ``'0'`` unless it is
    already set.

    Eager mode (``htorch.utils.internal.is_lazy()`` is false): defaults
    ``RUNTIME_SCALE_PATCHING`` and ``FUSER_ENABLE_MULTI_THREADED_INVOCATIONS``
    to ``'1'`` when unset, and sets a marker variable for each default
    applied here.

    Lazy mode: disables torch dynamo, enables lazy collectives, and removes
    any of the two eager-only variables whose marker is present.
    """
    env = os.environ

    # PT_HPU_WEIGHT_SHARING=0 is needed in both lazy and eager modes.
    # Only set if not already provided by the user.
    env.setdefault('PT_HPU_WEIGHT_SHARING', '0')

    if not htorch.utils.internal.is_lazy():
        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
        # torch.compile support.
        # If not set by user then for torch compile enable Runtime scale
        # patching by default; the marker records that it was auto-set.
        if 'RUNTIME_SCALE_PATCHING' not in env:
            env['RUNTIME_SCALE_PATCHING'] = '1'
            env[cls._MARKER_RUNTIME_SCALE_PATCHING] = '1'
        # This allows for utilization of Parallel Compilation feature
        if 'FUSER_ENABLE_MULTI_THREADED_INVOCATIONS' not in env:
            env['FUSER_ENABLE_MULTI_THREADED_INVOCATIONS'] = '1'
            env[cls._MARKER_FUSER_MULTI_THREADED] = '1'
        return

    # NOTE: PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
    # does not support torch.compile
    torch._dynamo.config.disable = True
    # NOTE multi-HPU inference with HPUGraphs (lazy-only)
    # requires enabling lazy collectives
    # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html  # noqa: E501
    env['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'

    # Remove eager-mode-only env vars that were auto-set by a prior
    # set_torch_compile() call (e.g. in a parent pytest process
    # that loaded vllm_gaudi as a plugin in eager mode).
    # User-explicitly-set values are left untouched: only a variable whose
    # marker is present (and non-empty) is removed.
    inherited = (
        (cls._MARKER_RUNTIME_SCALE_PATCHING, 'RUNTIME_SCALE_PATCHING', "Removed inherited RUNTIME_SCALE_PATCHING "
         "(auto-set by parent process in eager mode)"),
        (cls._MARKER_FUSER_MULTI_THREADED, 'FUSER_ENABLE_MULTI_THREADED_INVOCATIONS', "Removed inherited "
         "FUSER_ENABLE_MULTI_THREADED_INVOCATIONS "
         "(auto-set by parent process in eager mode)"),
    )
    for marker, auto_set_var, message in inherited:
        if env.pop(marker, None):
            env.pop(auto_set_var, None)
            logger.info(message)

support_hybrid_kv_cache classmethod

support_hybrid_kv_cache() -> bool
Source code in vllm_gaudi/platform.py
@classmethod
def support_hybrid_kv_cache(cls) -> bool:
    """Report hybrid KV cache support; always ``True``."""
    supported = True
    return supported

supports_structured_output classmethod

supports_structured_output() -> bool
Source code in vllm_gaudi/platform.py
@classmethod
def supports_structured_output(cls) -> bool:
    """Report structured output support; always ``True``."""
    supported = True
    return supported

supports_v1 classmethod

supports_v1(model_config: ModelConfig) -> bool
Source code in vllm_gaudi/platform.py
@classmethod
def supports_v1(cls, model_config: ModelConfig) -> bool:
    """Report V1 engine support; always ``True``, ``model_config`` is ignored."""
    # V1 support on HPU is experimental
    supported = True
    return supported

swap_out_blocks_to_host classmethod

swap_out_blocks_to_host(
    src_cache: Union[tuple[Tensor], Tensor],
    dst_cache: Tensor,
    src_block_indices: Tensor,
    dst_block_indices: Tensor,
) -> None

Copy blocks from HPU to host (CPU).

Source code in vllm_gaudi/platform.py
@classmethod
def swap_out_blocks_to_host(
    cls,
    src_cache: Union[tuple[torch.Tensor], torch.Tensor],
    dst_cache: torch.Tensor,
    src_block_indices: torch.Tensor,
    dst_block_indices: torch.Tensor,
) -> None:
    """Copy blocks from HPU to host (CPU).

    For a single source tensor, the rows selected by ``src_block_indices``
    are written to ``dst_cache`` at ``dst_block_indices`` along the first
    dimension. For a tuple of source tensors, each entry is indexed the same
    way, the selections are stacked along a new leading dimension, and the
    result is written to ``dst_cache[:, dst_block_indices]``. ``dst_cache``
    is modified in place.
    """
    if not isinstance(src_cache, tuple):
        dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
        return

    selected = [part[src_block_indices] for part in src_cache]
    stacked = torch.stack(selected, dim=0)
    dst_cache[:, dst_block_indices] = stacked.cpu()

update_block_size_for_backend classmethod

update_block_size_for_backend(
    vllm_config: VllmConfig,
) -> None
Source code in vllm_gaudi/platform.py
@classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
    """Choose the KV-cache block size, with special handling for Granite 4.0-H.

    For every model except ``granitemoehybrid`` this simply defers to the
    parent implementation.

    For ``granitemoehybrid`` the attention block size is computed here from
    the mamba page size and written to ``cache_config.block_size`` (and to
    ``cache_config.mamba_block_size`` when ``mamba_cache_mode == "align"``).
    The parent implementation is then called with
    ``cache_config.user_specified_block_size`` forced to ``True`` so that
    the value set here is preserved. The flag's original value is restored
    afterwards, even if the parent call raises.

    Args:
        vllm_config: the config whose ``cache_config`` is updated in place.
    """
    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config

    # For Granite 4.0-H (granitemoehybrid), we compute the correct
    # block_size in this method using the PC-aware alignment formula
    # (528 without prefix caching, 768 with prefix caching).
    # We set block_size before calling super and mark it as
    # user-specified so Phase 1 preserves it; Phase 2
    # (_align_hybrid_block_size) then validates and sets
    # mamba_page_size_padded.
    is_granite_hybrid = (model_config is not None
                         and getattr(model_config.hf_config, "model_type", None) == "granitemoehybrid")
    if not is_granite_hybrid:
        super().update_block_size_for_backend(vllm_config)
        return

    # Compute the correct block_size using the PC-aware formula.
    from vllm.utils.math_utils import cdiv
    from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
    from vllm.model_executor.models import ModelRegistry
    if cache_config.cache_dtype == "auto":
        kv_dtype = model_config.dtype
    else:
        from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
        kv_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]

    # Page size in bytes of an attention spec built with block_size=1.
    attn_1tok = FullAttentionSpec(
        block_size=1,
        num_kv_heads=model_config.get_num_kv_heads(vllm_config.parallel_config),
        head_size=model_config.get_head_size(),
        dtype=kv_dtype,
    ).page_size_bytes
    model_cls, _ = ModelRegistry.resolve_model_cls(
        model_config.architecture,
        model_config=model_config,
    )
    mamba_page_size = MambaSpec(
        shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
        dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
        block_size=-1,
    ).page_size_bytes

    if mamba_page_size > 0:
        # With prefix caching the block size is aligned to the mamba chunk
        # size (hf_config.mamba_d_chunk, default 256); otherwise to 16.
        if cache_config.enable_prefix_caching:
            mamba_chunk_size = getattr(model_config.hf_config, 'mamba_d_chunk', 256)
            alignment = mamba_chunk_size
        else:
            alignment = 16
        # Round up to a multiple of `alignment` tokens using the mamba page
        # size and the block_size=1 attention page size computed above.
        attn_block_size = alignment * cdiv(mamba_page_size, alignment * attn_1tok)
        cache_config.block_size = attn_block_size
        if cache_config.mamba_cache_mode == "align":
            cache_config.mamba_block_size = attn_block_size
        logger.info(
            "Setting granitemoehybrid block_size to %d tokens "
            "(alignment=%d, mamba_page_size=%d bytes, "
            "prefix_caching=%s).",
            attn_block_size,
            alignment,
            mamba_page_size,
            cache_config.enable_prefix_caching,
        )

    if cache_config.user_specified_block_size:
        # Already marked as user-specified: no temporary override needed.
        super().update_block_size_for_backend(vllm_config)
        return

    # Temporarily mark the block size as user-specified for the parent call,
    # and always restore the flag so a failure there cannot leave the config
    # permanently marked as user-specified.
    cache_config.user_specified_block_size = True
    try:
        super().update_block_size_for_backend(vllm_config)
    finally:
        cache_config.user_specified_block_size = False

use_sync_weight_loader classmethod

use_sync_weight_loader() -> bool

Returns if the current platform needs to sync weight loader.

Source code in vllm_gaudi/platform.py
@classmethod
def use_sync_weight_loader(cls) -> bool:
    """Returns if the current platform needs to sync weight loader.

    Controlled by ``VLLM_WEIGHT_LOAD_FORCE_SYNC``: the result is ``True``
    when the variable is unset or set to ``true`` / ``1`` (case-insensitive),
    and ``False`` for any other value.
    """
    raw_value = os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", "true")
    return raw_value.lower() in ("true", "1")

is_qwen3_5_hybrid_model

is_qwen3_5_hybrid_model(
    model_config: Optional[ModelConfig],
) -> bool
Source code in vllm_gaudi/platform.py
def is_qwen3_5_hybrid_model(model_config: Optional[ModelConfig]) -> bool:
    """Return whether ``model_config`` describes a Qwen3.5 hybrid model.

    The result is ``False`` when ``model_config`` is ``None`` or is not
    hybrid. Otherwise the architecture names from
    ``model_config.hf_config.architectures`` and from
    ``model_config.architecture`` (either may be missing) are collected, and
    the result is ``True`` if any of them is in ``QWEN3_5_HYBRID_ARCHS``.
    """
    if model_config is None or not model_config.is_hybrid:
        return False

    hf_config = getattr(model_config, "hf_config", None)
    # A missing or empty `architectures` attribute yields an empty set.
    candidates = set(getattr(hf_config, "architectures", []) or [])

    resolved = getattr(model_config, "architecture", None)
    if resolved is not None:
        candidates.add(resolved)

    return not QWEN3_5_HYBRID_ARCHS.isdisjoint(candidates)

retain_envs

retain_envs(var_name)
Source code in vllm_gaudi/platform.py
def retain_envs(var_name):
    """Return whether the environment variable ``var_name`` should be retained.

    A name is retained when it contains ``HPU``, ``RAY`` or ``VLLM`` as a
    substring, or when it is exactly one of the ``*_SOCKET_IFNAME`` variables
    for GLOO, HCCL or NCCL.
    """
    if any(marker in var_name for marker in ('HPU', 'RAY', 'VLLM')):
        return True
    return var_name in ('GLOO_SOCKET_IFNAME', 'HCCL_SOCKET_IFNAME', 'NCCL_SOCKET_IFNAME')