
vllm.platforms

Modules:

Name        Description
cpu
cuda        Code inside this file can safely assume cuda platform, e.g. importing pynvml.
hpu
interface
neuron
rocm
tpu
xpu

__all__ module-attribute

__all__ = [
    "Platform",
    "PlatformEnum",
    "current_platform",
    "CpuArchEnum",
    "_init_trace",
]

_current_platform module-attribute

_current_platform = None

_init_trace module-attribute

_init_trace: str = ''

builtin_platform_plugins module-attribute

builtin_platform_plugins = {
    "tpu": tpu_platform_plugin,
    "cuda": cuda_platform_plugin,
    "rocm": rocm_platform_plugin,
    "hpu": hpu_platform_plugin,
    "xpu": xpu_platform_plugin,
    "cpu": cpu_platform_plugin,
    "neuron": neuron_platform_plugin,
}

current_platform module-attribute

current_platform: Platform

logger module-attribute

logger = getLogger(__name__)

CpuArchEnum

Bases: Enum

Source code in vllm/platforms/interface.py
class CpuArchEnum(enum.Enum):
    X86 = enum.auto()
    ARM = enum.auto()
    POWERPC = enum.auto()
    OTHER = enum.auto()
    UNKNOWN = enum.auto()

ARM class-attribute instance-attribute

ARM = auto()

OTHER class-attribute instance-attribute

OTHER = auto()

POWERPC class-attribute instance-attribute

POWERPC = auto()

UNKNOWN class-attribute instance-attribute

UNKNOWN = auto()

X86 class-attribute instance-attribute

X86 = auto()

Platform

Source code in vllm/platforms/interface.py
class Platform:
    _enum: PlatformEnum
    device_name: str
    device_type: str

    # available dispatch keys:
    # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
    # use "CPU" as a fallback for platforms not registered in PyTorch
    dispatch_key: str = "CPU"

    # available ray device keys:
    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
    # empty string means the device does not support ray
    ray_device_key: str = ""

    # platform-agnostic way to specify the device control environment variable,
    # e.g. CUDA_VISIBLE_DEVICES for CUDA.
    # hint: search for "get_visible_accelerator_ids_env_var" in
    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
    device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"

    # The torch.compile backend for compiling simple and
    # standalone functions. The default value is "inductor" to keep
    # the same behavior as PyTorch.
    # NOTE: for the forward part of the model, vLLM has another separate
    # compilation strategy.
    simple_compile_backend: str = "inductor"

    supported_quantization: list[str] = []

    additional_env_vars: list[str] = []

    @property
    def supported_dtypes(self) -> list[torch.dtype]:
        """Returns the supported dtypes for the current platform."""
        # Be careful with the order of the dtypes. The first dtype will
        # be used as the default dtype fallback for the current platform,
        # when encountering unsupported dtypes in "auto" dtype.
        return [torch.bfloat16, torch.float16, torch.float32]

    def is_cuda(self) -> bool:
        return self._enum == PlatformEnum.CUDA

    def is_rocm(self) -> bool:
        return self._enum == PlatformEnum.ROCM

    def is_tpu(self) -> bool:
        return self._enum == PlatformEnum.TPU

    def is_hpu(self) -> bool:
        return self._enum == PlatformEnum.HPU

    def is_xpu(self) -> bool:
        return self._enum == PlatformEnum.XPU

    def is_cpu(self) -> bool:
        return self._enum == PlatformEnum.CPU

    def is_neuron(self) -> bool:
        return self._enum == PlatformEnum.NEURON

    def is_out_of_tree(self) -> bool:
        return self._enum == PlatformEnum.OOT

    def is_cuda_alike(self) -> bool:
        """Stateless version of [torch.cuda.is_available][]."""
        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

    def is_sleep_mode_available(self) -> bool:
        return self._enum == PlatformEnum.CUDA

    @classmethod
    def device_id_to_physical_device_id(cls, device_id: int):
        if cls.device_control_env_var in os.environ:
            device_ids = os.environ[cls.device_control_env_var].split(",")
            if device_ids == [""]:
                msg = (f"{cls.device_control_env_var} is set to empty string, "
                       "which means current platform support is disabled. If "
                       "you are using ray, please unset the environment "
                       f"variable `{cls.device_control_env_var}` inside the "
                       "worker/actor. Check "
                       "https://github.com/vllm-project/vllm/issues/8402 for "
                       "more information.")
                raise RuntimeError(msg)
            physical_device_id = device_ids[device_id]
            return int(physical_device_id)
        else:
            return device_id

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Get the attention backend class of a device."""
        return ""

    @classmethod
    def get_device_capability(
        cls,
        device_id: int = 0,
    ) -> Optional[DeviceCapability]:
        """Stateless version of [torch.cuda.get_device_capability][]."""
        return None

    @classmethod
    def has_device_capability(
        cls,
        capability: Union[tuple[int, int], int],
        device_id: int = 0,
    ) -> bool:
        """
        Test whether this platform is compatible with a device capability.

        The `capability` argument can either be:

        - A tuple `(major, minor)`.
        - An integer `<major><minor>`. (See
        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:
            return False

        if isinstance(capability, tuple):
            return current_capability >= capability

        return current_capability.to_int() >= capability

    @classmethod
    def is_device_capability(
        cls,
        capability: Union[tuple[int, int], int],
        device_id: int = 0,
    ) -> bool:
        """
        Test whether this platform has exactly the specified device capability.

        The `capability` argument can either be:

        - A tuple `(major, minor)`.
        - An integer `<major><minor>`. (See
        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:
            return False

        if isinstance(capability, tuple):
            return current_capability == capability

        return current_capability.to_int() == capability

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Get the name of a device."""
        raise NotImplementedError

    @classmethod
    def get_device_uuid(cls, device_id: int = 0) -> str:
        """Get the uuid of a device, e.g. the PCI bus ID."""
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Get the total memory of a device in bytes."""
        raise NotImplementedError

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """
        Check if the current platform supports async output.
        """
        raise NotImplementedError

    @classmethod
    def inference_mode(cls):
        """A device-specific wrapper of `torch.inference_mode`.

        This wrapper is recommended because some hardware backends such as TPU
        do not support `torch.inference_mode`. In such a case, they will fall
        back to `torch.no_grad` by overriding this method.
        """
        return torch.inference_mode(mode=True)

    @classmethod
    def seed_everything(cls, seed: Optional[int] = None) -> None:
        """
        Set the seed of each random module.
        `torch.manual_seed` will set seed on all devices.

        Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.
        """
        torch.cuda.set_device(device)

    @classmethod
    def pre_register_and_update(cls,
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
        """
        Do some pre-registration or update action for the current platform.

        This function is called before global VllmConfig is initialized or cli
        arguments are parsed. It's used for out-of-tree platforms to register or
        update the configuration.

        For example, the out-of-tree quantization config can be imported and
        registered here dynamically.
        """
        pass

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """
        Check and update the configuration for the current platform.

        It can raise an exception if the configuration is not compatible with
        the current platform, or it can update the configuration to make it
        compatible with the current platform.

        The config is passed by reference, so it can be modified in place.
        """
        pass

    @classmethod
    def verify_model_arch(cls, model_arch: str) -> None:
        """
        Verify whether the current platform supports the specified model
        architecture.

        - This will raise an Error or Warning based on the model support on
        the current platform.
        - By default all models are considered supported.
        """
        pass

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        """
        Verify whether the quantization is supported by the current platform.
        """
        if cls.supported_quantization and \
            quant not in cls.supported_quantization:
            raise ValueError(
                f"{quant} quantization is currently not supported in "
                f"{cls.device_name}.")

    @classmethod
    def get_cpu_architecture(cls) -> CpuArchEnum:
        """
        Determine the CPU architecture of the current system.
        Returns CpuArchEnum indicating the architecture type.
        """
        machine = platform.machine().lower()

        if machine in ("x86_64", "amd64", "i386", "i686"):
            return CpuArchEnum.X86
        elif machine.startswith("arm") or machine.startswith("aarch"):
            return CpuArchEnum.ARM
        elif machine.startswith("ppc"):
            return CpuArchEnum.POWERPC

        return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Checks whether pin memory is available on the current platform."""
        if in_wsl():
            # Pinning memory in WSL is not supported.
            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
            logger.warning("Using 'pin_memory=False' as WSL is detected. "
                           "This may slow down the performance.")
            return False
        return True

    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        """
        Return the memory usage in bytes.
        """
        raise NotImplementedError

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """
        Return the punica wrapper for current platform.
        """
        raise NotImplementedError

    @classmethod
    def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
        """
        Return the platform specific values for (-inf, inf)
        """
        return float("-inf"), float("inf")

    @classmethod
    def can_update_inplace(cls) -> bool:
        """
        Checks if the platform allows inplace memory updates
        """
        return True

    @classmethod
    def get_lora_vocab_padding_size(cls) -> int:
        """
        Returns how much padding the LoRA logits need for kernels
        """
        return 256

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """
        Get device specific communicator class for distributed communication.
        """
        return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa

    @classmethod
    def supports_mx(cls) -> bool:
        """
        Returns whether the current platform supports MX types.
        """
        return False

    @classmethod
    def supports_fp8(cls) -> bool:
        """
        Returns whether the current platform supports FP8 types.
        """
        return False

    @classmethod
    def is_fp8_fnuz(cls) -> bool:
        """
        Returns whether the preferred FP8 type is FNUZ on the current platform.

        There are two representations of FP8, OCP FP8 and FNUZ FP8.
        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.

        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
        hardware has converged on the OCP FP8 standard.
        """
        return False

    @classmethod
    def fp8_dtype(cls) -> torch.dtype:
        """
        Returns the preferred FP8 type on the current platform.

        See the documentation for is_fp8_fnuz for details.
        """
        return torch.float8_e4m3fn

    @classmethod
    def use_all_gather(cls) -> bool:
        """
        Whether to use allgather in LogitsProcessor to gather the logits.
        """
        import vllm.envs as envs
        from vllm.config import get_current_vllm_config

        parallel_config = get_current_vllm_config().parallel_config
        return (envs.VLLM_USE_V1
                or parallel_config.distributed_executor_backend
                == "external_launcher")

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        """Returns whether the current platform can support v1 for the supplied
        model configuration.
        """
        return False

    @classmethod
    def default_v1(cls, model_config: ModelConfig) -> bool:
        """
        Returns whether the current platform supports v1 by default.
        """
        return cls.supports_v1(model_config)

    @classmethod
    def use_custom_allreduce(cls) -> bool:
        """
        Returns if custom allreduce is supported on the current platform
        """
        return False

    @classmethod
    def validate_request(
        cls,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        processed_inputs: ProcessorInputs,
    ) -> None:
        """Raises if this request is unsupported on this platform"""

    def __getattr__(self, key: str):
        device = getattr(torch, self.device_type, None)
        if device is not None and hasattr(device, key):
            return getattr(device, key)
        else:
            logger.warning("Current platform %s does not have '%s'" \
            " attribute.", self.device_type, key)
            return None

    @classmethod
    def get_cu_count(cls, device_id: int = 0) -> int:
        """
        Returns the total number of compute units (CU) on single GPU.
        """
        raise NotImplementedError

    @classmethod
    def get_piecewise_backend_cls(cls) -> str:
        """
        Get piecewise backend class for piecewise graph.
        """
        return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa

    @classmethod
    def stateless_init_device_torch_dist_pg(
        cls,
        backend: str,
        prefix_store: PrefixStore,
        group_rank: int,
        group_size: int,
        timeout: timedelta,
    ) -> ProcessGroup:
        """
        Init platform-specific torch distributed process group.
        """
        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")

_enum instance-attribute

_enum: PlatformEnum

additional_env_vars class-attribute instance-attribute

additional_env_vars: list[str] = []

device_control_env_var class-attribute instance-attribute

device_control_env_var: str = (
    "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
)

device_name instance-attribute

device_name: str

device_type instance-attribute

device_type: str

dispatch_key class-attribute instance-attribute

dispatch_key: str = 'CPU'

ray_device_key class-attribute instance-attribute

ray_device_key: str = ''

simple_compile_backend class-attribute instance-attribute

simple_compile_backend: str = 'inductor'

supported_dtypes property

supported_dtypes: list[dtype]

Returns the supported dtypes for the current platform.

supported_quantization class-attribute instance-attribute

supported_quantization: list[str] = []

__getattr__

__getattr__(key: str)
Source code in vllm/platforms/interface.py
def __getattr__(self, key: str):
    device = getattr(torch, self.device_type, None)
    if device is not None and hasattr(device, key):
        return getattr(device, key)
    else:
        logger.warning("Current platform %s does not have '%s'" \
        " attribute.", self.device_type, key)
        return None
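
For example, on a CUDA machine (where device_type is "cuda") an unknown attribute lookup on the platform object is forwarded to the matching torch device module. A small usage sketch:

from vllm.platforms import current_platform

# On a CUDA platform this lookup is forwarded to torch.cuda.empty_cache.
# If the torch device module does not define the attribute, a warning is
# logged and None is returned instead.
empty_cache = current_platform.empty_cache
if empty_cache is not None:
    empty_cache()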

can_update_inplace classmethod

can_update_inplace() -> bool

Checks if the platform allows inplace memory updates

Source code in vllm/platforms/interface.py
@classmethod
def can_update_inplace(cls) -> bool:
    """
    Checks if the platform allows inplace memory updates
    """
    return True

check_and_update_config classmethod

check_and_update_config(vllm_config: VllmConfig) -> None

Check and update the configuration for the current platform.

It can raise an exception if the configuration is not compatible with the current platform, or it can update the configuration to make it compatible with the current platform.

The config is passed by reference, so it can be modified in place.

Source code in vllm/platforms/interface.py
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
    """
    Check and update the configuration for the current platform.

    It can raise an exception if the configuration is not compatible with
    the current platform, or it can update the configuration to make it
    compatible with the current platform.

    The config is passed by reference, so it can be modified in place.
    """
    pass
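
A hypothetical out-of-tree platform might override this hook as sketched below; the cache_config.block_size field is used purely as an illustration of mutating the config in place.

from vllm.config import VllmConfig
from vllm.platforms import Platform


class MyPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        # The config is passed by reference, so in-place mutation is enough.
        cache_config = vllm_config.cache_config
        if cache_config is not None and cache_config.block_size is None:
            cache_config.block_size = 16  # hypothetical platform default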

default_v1 classmethod

default_v1(model_config: ModelConfig) -> bool

Returns whether the current platform supports v1 by default.

Source code in vllm/platforms/interface.py
@classmethod
def default_v1(cls, model_config: ModelConfig) -> bool:
    """
    Returns whether the current platform supports v1 by default.
    """
    return cls.supports_v1(model_config)

device_id_to_physical_device_id classmethod

device_id_to_physical_device_id(device_id: int)
Source code in vllm/platforms/interface.py
@classmethod
def device_id_to_physical_device_id(cls, device_id: int):
    if cls.device_control_env_var in os.environ:
        device_ids = os.environ[cls.device_control_env_var].split(",")
        if device_ids == [""]:
            msg = (f"{cls.device_control_env_var} is set to empty string, "
                   "which means current platform support is disabled. If "
                   "you are using ray, please unset the environment "
                   f"variable `{cls.device_control_env_var}` inside the "
                   "worker/actor. Check "
                   "https://github.com/vllm-project/vllm/issues/8402 for "
                   "more information.")
            raise RuntimeError(msg)
        physical_device_id = device_ids[device_id]
        return int(physical_device_id)
    else:
        return device_id
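
For instance, on the CUDA platform the device control variable is CUDA_VISIBLE_DEVICES, so the mapping behaves as in this sketch (the environment value is illustrative):

import os

from vllm.platforms import current_platform

# Suppose the launcher restricted this process to two physical GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,5"  # illustrative value

# Logical device 1 inside this process maps to physical GPU 5, assuming
# current_platform.device_control_env_var == "CUDA_VISIBLE_DEVICES"
# (the CUDA platform's setting).
print(current_platform.device_id_to_physical_device_id(1))  # -> 5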

fp8_dtype classmethod

fp8_dtype() -> dtype

Returns the preferred FP8 type on the current platform.

See the documentation for is_fp8_fnuz for details.

Source code in vllm/platforms/interface.py
@classmethod
def fp8_dtype(cls) -> torch.dtype:
    """
    Returns the preferred FP8 type on the current platform.

    See the documentation for is_fp8_fnuz for details.
    """
    return torch.float8_e4m3fn

get_attn_backend_cls classmethod

get_attn_backend_cls(
    selected_backend: _Backend,
    head_size: int,
    dtype: dtype,
    kv_cache_dtype: Optional[str],
    block_size: int,
    use_v1: bool,
    use_mla: bool,
) -> str

Get the attention backend class of a device.

Source code in vllm/platforms/interface.py
@classmethod
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                         dtype: torch.dtype, kv_cache_dtype: Optional[str],
                         block_size: int, use_v1: bool,
                         use_mla: bool) -> str:
    """Get the attention backend class of a device."""
    return ""

get_cpu_architecture classmethod

get_cpu_architecture() -> CpuArchEnum

Determine the CPU architecture of the current system. Returns CpuArchEnum indicating the architecture type.

Source code in vllm/platforms/interface.py
@classmethod
def get_cpu_architecture(cls) -> CpuArchEnum:
    """
    Determine the CPU architecture of the current system.
    Returns CpuArchEnum indicating the architecture type.
    """
    machine = platform.machine().lower()

    if machine in ("x86_64", "amd64", "i386", "i686"):
        return CpuArchEnum.X86
    elif machine.startswith("arm") or machine.startswith("aarch"):
        return CpuArchEnum.ARM
    elif machine.startswith("ppc"):
        return CpuArchEnum.POWERPC

    return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN

get_cu_count classmethod

get_cu_count(device_id: int = 0) -> int

Returns the total number of compute units (CU) on single GPU.

Source code in vllm/platforms/interface.py
@classmethod
def get_cu_count(cls, device_id: int = 0) -> int:
    """
    Returns the total number of compute units (CU) on single GPU.
    """
    raise NotImplementedError

get_current_memory_usage classmethod

get_current_memory_usage(
    device: Optional[Device] = None,
) -> float

Return the memory usage in bytes.

Source code in vllm/platforms/interface.py
@classmethod
def get_current_memory_usage(cls,
                             device: Optional[torch.types.Device] = None
                             ) -> float:
    """
    Return the memory usage in bytes.
    """
    raise NotImplementedError

get_device_capability classmethod

get_device_capability(
    device_id: int = 0,
) -> Optional[DeviceCapability]

Stateless version of torch.cuda.get_device_capability.

Source code in vllm/platforms/interface.py
@classmethod
def get_device_capability(
    cls,
    device_id: int = 0,
) -> Optional[DeviceCapability]:
    """Stateless version of [torch.cuda.get_device_capability][]."""
    return None

get_device_communicator_cls classmethod

get_device_communicator_cls() -> str

Get device specific communicator class for distributed communication.

Source code in vllm/platforms/interface.py
@classmethod
def get_device_communicator_cls(cls) -> str:
    """
    Get device specific communicator class for distributed communication.
    """
    return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa

get_device_name classmethod

get_device_name(device_id: int = 0) -> str

Get the name of a device.

Source code in vllm/platforms/interface.py
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
    """Get the name of a device."""
    raise NotImplementedError

get_device_total_memory classmethod

get_device_total_memory(device_id: int = 0) -> int

Get the total memory of a device in bytes.

Source code in vllm/platforms/interface.py
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Get the total memory of a device in bytes."""
    raise NotImplementedError

get_device_uuid classmethod

get_device_uuid(device_id: int = 0) -> str

Get the uuid of a device, e.g. the PCI bus ID.

Source code in vllm/platforms/interface.py
@classmethod
def get_device_uuid(cls, device_id: int = 0) -> str:
    """Get the uuid of a device, e.g. the PCI bus ID."""
    raise NotImplementedError

get_infinity_values classmethod

get_infinity_values(dtype: dtype) -> tuple[float, float]

Return the platform specific values for (-inf, inf)

Source code in vllm/platforms/interface.py
@classmethod
def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
    """
    Return the platform specific values for (-inf, inf)
    """
    return float("-inf"), float("inf")

get_lora_vocab_padding_size classmethod

get_lora_vocab_padding_size() -> int

Returns how much padding the LoRA logits need for kernels

Source code in vllm/platforms/interface.py
@classmethod
def get_lora_vocab_padding_size(cls) -> int:
    """
    Returns how much padding the LoRA logits need for kernels
    """
    return 256

get_piecewise_backend_cls classmethod

get_piecewise_backend_cls() -> str

Get piecewise backend class for piecewise graph.

Source code in vllm/platforms/interface.py
@classmethod
def get_piecewise_backend_cls(cls) -> str:
    """
    Get piecewise backend class for piecewise graph.
    """
    return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa

get_punica_wrapper classmethod

get_punica_wrapper() -> str

Return the punica wrapper for current platform.

Source code in vllm/platforms/interface.py
@classmethod
def get_punica_wrapper(cls) -> str:
    """
    Return the punica wrapper for current platform.
    """
    raise NotImplementedError

has_device_capability classmethod

has_device_capability(
    capability: Union[tuple[int, int], int],
    device_id: int = 0,
) -> bool

Test whether this platform is compatible with a device capability.

The capability argument can either be:

  • A tuple (major, minor).
  • An integer <major><minor>. (See DeviceCapability.to_int)

Source code in vllm/platforms/interface.py
@classmethod
def has_device_capability(
    cls,
    capability: Union[tuple[int, int], int],
    device_id: int = 0,
) -> bool:
    """
    Test whether this platform is compatible with a device capability.

    The `capability` argument can either be:

    - A tuple `(major, minor)`.
    - An integer `<major><minor>`. (See
    [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
    """
    current_capability = cls.get_device_capability(device_id=device_id)
    if current_capability is None:
        return False

    if isinstance(capability, tuple):
        return current_capability >= capability

    return current_capability.to_int() >= capability
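
Both argument forms are interchangeable. A sketch, assuming a platform that implements get_device_capability:

from vllm.platforms import current_platform

# Equivalent checks for compute capability >= 8.0 (Ampere-class GPUs):
ok_tuple = current_platform.has_device_capability((8, 0))
ok_int = current_platform.has_device_capability(80)  # <major><minor> form
assert ok_tuple == ok_int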

inference_mode classmethod

inference_mode()

A device-specific wrapper of torch.inference_mode.

This wrapper is recommended because some hardware backends such as TPU do not support torch.inference_mode. In such a case, they will fall back to torch.no_grad by overriding this method.

Source code in vllm/platforms/interface.py
@classmethod
def inference_mode(cls):
    """A device-specific wrapper of `torch.inference_mode`.

    This wrapper is recommended because some hardware backends such as TPU
    do not support `torch.inference_mode`. In such a case, they will fall
    back to `torch.no_grad` by overriding this method.
    """
    return torch.inference_mode(mode=True)

is_async_output_supported classmethod

is_async_output_supported(
    enforce_eager: Optional[bool],
) -> bool

Check if the current platform supports async output.

Source code in vllm/platforms/interface.py
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
    """
    Check if the current platform supports async output.
    """
    raise NotImplementedError

is_cpu

is_cpu() -> bool
Source code in vllm/platforms/interface.py
def is_cpu(self) -> bool:
    return self._enum == PlatformEnum.CPU

is_cuda

is_cuda() -> bool
Source code in vllm/platforms/interface.py
def is_cuda(self) -> bool:
    return self._enum == PlatformEnum.CUDA

is_cuda_alike

is_cuda_alike() -> bool

Stateless version of torch.cuda.is_available.

Source code in vllm/platforms/interface.py
def is_cuda_alike(self) -> bool:
    """Stateless version of [torch.cuda.is_available][]."""
    return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

is_device_capability classmethod

is_device_capability(
    capability: Union[tuple[int, int], int],
    device_id: int = 0,
) -> bool

Test whether this platform has exactly the specified device capability.

The capability argument can either be:

  • A tuple (major, minor).
  • An integer <major><minor>. (See DeviceCapability.to_int)

Source code in vllm/platforms/interface.py
@classmethod
def is_device_capability(
    cls,
    capability: Union[tuple[int, int], int],
    device_id: int = 0,
) -> bool:
    """
    Test whether this platform has exactly the specified device capability.

    The `capability` argument can either be:

    - A tuple `(major, minor)`.
    - An integer `<major><minor>`. (See
    [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
    """
    current_capability = cls.get_device_capability(device_id=device_id)
    if current_capability is None:
        return False

    if isinstance(capability, tuple):
        return current_capability == capability

    return current_capability.to_int() == capability

is_fp8_fnuz classmethod

is_fp8_fnuz() -> bool

Returns whether the preferred FP8 type is FNUZ on the current platform.

There are two representations of FP8, OCP FP8 and FNUZ FP8. The OCP specification can be found at https://tinyurl.com/b7jvwpft. The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.

AMD's MI300 and MI325 have native hardware support for FNUZ. All other hardware has converged on the OCP FP8 standard.

Source code in vllm/platforms/interface.py
@classmethod
def is_fp8_fnuz(cls) -> bool:
    """
    Returns whether the preferred FP8 type is FNUZ on the current platform.

    There are two representations of FP8, OCP FP8 and FNUZ FP8.
    The OCP specification can be found at https://tinyurl.com/b7jvwpft.
    The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.

    AMD's MI300 and MI325 have native hardware support for FNUZ. All other
    hardware has converged on the OCP FP8 standard.
    """
    return False
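
A platform with FNUZ hardware would typically override this hook together with fp8_dtype, since the preferred dtype follows directly from the FNUZ/OCP distinction. A hedged sketch with a hypothetical class name:

import torch

from vllm.platforms import Platform


class Fnuz8Platform(Platform):
    """Hypothetical platform whose hardware prefers FNUZ FP8."""

    @classmethod
    def is_fp8_fnuz(cls) -> bool:
        return True

    @classmethod
    def fp8_dtype(cls) -> torch.dtype:
        # The base class default is OCP FP8 (torch.float8_e4m3fn);
        # FNUZ hardware uses torch.float8_e4m3fnuz instead.
        return torch.float8_e4m3fnuz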

is_hpu

is_hpu() -> bool
Source code in vllm/platforms/interface.py
def is_hpu(self) -> bool:
    return self._enum == PlatformEnum.HPU

is_neuron

is_neuron() -> bool
Source code in vllm/platforms/interface.py
def is_neuron(self) -> bool:
    return self._enum == PlatformEnum.NEURON

is_out_of_tree

is_out_of_tree() -> bool
Source code in vllm/platforms/interface.py
def is_out_of_tree(self) -> bool:
    return self._enum == PlatformEnum.OOT

is_pin_memory_available classmethod

is_pin_memory_available() -> bool

Checks whether pin memory is available on the current platform.

Source code in vllm/platforms/interface.py
@classmethod
def is_pin_memory_available(cls) -> bool:
    """Checks whether pin memory is available on the current platform."""
    if in_wsl():
        # Pinning memory in WSL is not supported.
        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
        logger.warning("Using 'pin_memory=False' as WSL is detected. "
                       "This may slow down the performance.")
        return False
    return True

is_rocm

is_rocm() -> bool
Source code in vllm/platforms/interface.py
def is_rocm(self) -> bool:
    return self._enum == PlatformEnum.ROCM

is_sleep_mode_available

is_sleep_mode_available() -> bool
Source code in vllm/platforms/interface.py
def is_sleep_mode_available(self) -> bool:
    return self._enum == PlatformEnum.CUDA

is_tpu

is_tpu() -> bool
Source code in vllm/platforms/interface.py
def is_tpu(self) -> bool:
    return self._enum == PlatformEnum.TPU

is_xpu

is_xpu() -> bool
Source code in vllm/platforms/interface.py
def is_xpu(self) -> bool:
    return self._enum == PlatformEnum.XPU

pre_register_and_update classmethod

pre_register_and_update(
    parser: Optional[FlexibleArgumentParser] = None,
) -> None

Do some pre-registration or update action for the current platform.

This function is called before global VllmConfig is initialized or cli arguments are parsed. It's used for out-of-tree platforms to register or update the configuration.

For example, the out-of-tree quantization config can be imported and registered here dynamically.

Source code in vllm/platforms/interface.py
@classmethod
def pre_register_and_update(cls,
                            parser: Optional[FlexibleArgumentParser] = None
                            ) -> None:
    """
    Do some pre-registration or update action for the current platform.

    This function is called before global VllmConfig is initialized or cli
    arguments are parsed. It's used for out-of-tree platforms to register or
    update the configuration.

    For example, the out-of-tree quantization config can be imported and
    registered here dynamically.
    """
    pass

seed_everything classmethod

seed_everything(seed: Optional[int] = None) -> None

Set the seed of each random module. torch.manual_seed will set seed on all devices.

Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20

Source code in vllm/platforms/interface.py
@classmethod
def seed_everything(cls, seed: Optional[int] = None) -> None:
    """
    Set the seed of each random module.
    `torch.manual_seed` will set seed on all devices.

    Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

set_device classmethod

set_device(device: device) -> None

Set the device for the current platform.

Source code in vllm/platforms/interface.py
@classmethod
def set_device(cls, device: torch.device) -> None:
    """
    Set the device for the current platform.
    """
    torch.cuda.set_device(device)

stateless_init_device_torch_dist_pg classmethod

stateless_init_device_torch_dist_pg(
    backend: str,
    prefix_store: PrefixStore,
    group_rank: int,
    group_size: int,
    timeout: timedelta,
) -> ProcessGroup

Init platform-specific torch distributed process group.

Source code in vllm/platforms/interface.py
@classmethod
def stateless_init_device_torch_dist_pg(
    cls,
    backend: str,
    prefix_store: PrefixStore,
    group_rank: int,
    group_size: int,
    timeout: timedelta,
) -> ProcessGroup:
    """
    Init platform-specific torch distributed process group.
    """
    raise RuntimeError(f"Unsupported torch distributed backend: {backend}")

supports_fp8 classmethod

supports_fp8() -> bool

Returns whether the current platform supports FP8 types.

Source code in vllm/platforms/interface.py
@classmethod
def supports_fp8(cls) -> bool:
    """
    Returns whether the current platform supports FP8 types.
    """
    return False

supports_mx classmethod

supports_mx() -> bool

Returns whether the current platform supports MX types.

Source code in vllm/platforms/interface.py
@classmethod
def supports_mx(cls) -> bool:
    """
    Returns whether the current platform supports MX types.
    """
    return False

supports_v1 classmethod

supports_v1(model_config: ModelConfig) -> bool

Returns whether the current platform can support v1 for the supplied model configuration.

Source code in vllm/platforms/interface.py
@classmethod
def supports_v1(cls, model_config: ModelConfig) -> bool:
    """Returns whether the current platform can support v1 for the supplied
    model configuration.
    """
    return False

use_all_gather classmethod

use_all_gather() -> bool

Whether to use allgather in LogitsProcessor to gather the logits.

Source code in vllm/platforms/interface.py
@classmethod
def use_all_gather(cls) -> bool:
    """
    Whether to use allgather in LogitsProcessor to gather the logits.
    """
    import vllm.envs as envs
    from vllm.config import get_current_vllm_config

    parallel_config = get_current_vllm_config().parallel_config
    return (envs.VLLM_USE_V1
            or parallel_config.distributed_executor_backend
            == "external_launcher")

use_custom_allreduce classmethod

use_custom_allreduce() -> bool

Returns if custom allreduce is supported on the current platform

Source code in vllm/platforms/interface.py
@classmethod
def use_custom_allreduce(cls) -> bool:
    """
    Returns if custom allreduce is supported on the current platform
    """
    return False

validate_request classmethod

validate_request(
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    processed_inputs: ProcessorInputs,
) -> None

Raises if this request is unsupported on this platform

Source code in vllm/platforms/interface.py
@classmethod
def validate_request(
    cls,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    processed_inputs: ProcessorInputs,
) -> None:
    """Raises if this request is unsupported on this platform"""

verify_model_arch classmethod

verify_model_arch(model_arch: str) -> None

Verify whether the current platform supports the specified model architecture.

  • This will raise an Error or Warning based on the model support on the current platform.
  • By default all models are considered supported.
Source code in vllm/platforms/interface.py
@classmethod
def verify_model_arch(cls, model_arch: str) -> None:
    """
    Verify whether the current platform supports the specified model
    architecture.

    - This will raise an Error or Warning based on the model support on
    the current platform.
    - By default all models are considered supported.
    """
    pass

verify_quantization classmethod

verify_quantization(quant: str) -> None

Verify whether the quantization is supported by the current platform.

Source code in vllm/platforms/interface.py
@classmethod
def verify_quantization(cls, quant: str) -> None:
    """
    Verify whether the quantization is supported by the current platform.
    """
    if cls.supported_quantization and \
        quant not in cls.supported_quantization:
        raise ValueError(
            f"{quant} quantization is currently not supported in "
            f"{cls.device_name}.")

PlatformEnum

Bases: Enum

Source code in vllm/platforms/interface.py
class PlatformEnum(enum.Enum):
    CUDA = enum.auto()
    ROCM = enum.auto()
    TPU = enum.auto()
    HPU = enum.auto()
    XPU = enum.auto()
    CPU = enum.auto()
    NEURON = enum.auto()
    OOT = enum.auto()
    UNSPECIFIED = enum.auto()

CPU class-attribute instance-attribute

CPU = auto()

CUDA class-attribute instance-attribute

CUDA = auto()

HPU class-attribute instance-attribute

HPU = auto()

NEURON class-attribute instance-attribute

NEURON = auto()

OOT class-attribute instance-attribute

OOT = auto()

ROCM class-attribute instance-attribute

ROCM = auto()

TPU class-attribute instance-attribute

TPU = auto()

UNSPECIFIED class-attribute instance-attribute

UNSPECIFIED = auto()

XPU class-attribute instance-attribute

XPU = auto()

__getattr__

__getattr__(name: str)
Source code in vllm/platforms/__init__.py
def __getattr__(name: str):
    if name == 'current_platform':
        # lazy init current_platform.
        # 1. out-of-tree platform plugins need `from vllm.platforms import
        #    Platform` so that they can inherit `Platform` class. Therefore,
        #    we cannot resolve `current_platform` during the import of
        #    `vllm.platforms`.
        # 2. when users use out-of-tree platform plugins, they might run
        #    `import vllm`, some vllm internal code might access
        #    `current_platform` during the import, and we need to make sure
        #    `current_platform` is only resolved after the plugins are loaded
        #    (we have tests for this, if any developer violate this, they will
        #    see the test failures).
        global _current_platform
        if _current_platform is None:
            platform_cls_qualname = resolve_current_platform_cls_qualname()
            _current_platform = resolve_obj_by_qualname(
                platform_cls_qualname)()
            global _init_trace
            _init_trace = "".join(traceback.format_stack())
        return _current_platform
    elif name in globals():
        return globals()[name]
    else:
        raise AttributeError(
            f"No attribute named '{name}' exists in {__name__}.")

cpu_platform_plugin

cpu_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def cpu_platform_plugin() -> Optional[str]:
    is_cpu = False
    logger.debug("Checking if CPU platform is available.")
    try:
        is_cpu = vllm_version_matches_substr("cpu")
        if is_cpu:
            logger.debug("Confirmed CPU platform is available because"
                         " vLLM is built with CPU.")
        if not is_cpu:
            import sys
            is_cpu = sys.platform.startswith("darwin")
            if is_cpu:
                logger.debug("Confirmed CPU platform is available"
                             " because the machine is MacOS.")

    except Exception as e:
        logger.debug("CPU platform is not available because: %s", str(e))

    return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None

cuda_platform_plugin

cuda_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def cuda_platform_plugin() -> Optional[str]:
    is_cuda = False
    logger.debug("Checking if CUDA platform is available.")
    try:
        from vllm.utils import import_pynvml
        pynvml = import_pynvml()
        pynvml.nvmlInit()
        try:
            # NOTE: Edge case: vllm cpu build on a GPU machine.
            # Third-party pynvml can be imported in cpu build,
            # we need to check if vllm is built with cpu too.
            # Otherwise, vllm will always activate cuda plugin
            # on a GPU machine, even if in a cpu build.
            is_cuda = (pynvml.nvmlDeviceGetCount() > 0
                       and not vllm_version_matches_substr("cpu"))
            if pynvml.nvmlDeviceGetCount() <= 0:
                logger.debug(
                    "CUDA platform is not available because no GPU is found.")
            if vllm_version_matches_substr("cpu"):
                logger.debug("CUDA platform is not available because"
                             " vLLM is built with CPU.")
            if is_cuda:
                logger.debug("Confirmed CUDA platform is available.")
        finally:
            pynvml.nvmlShutdown()
    except Exception as e:
        logger.debug("Exception happens when checking CUDA platform: %s",
                     str(e))
        if "nvml" not in e.__class__.__name__.lower():
            # If the error is not related to NVML, re-raise it.
            raise e

        # CUDA is supported on Jetson, but NVML may not be.
        import os

        def cuda_is_jetson() -> bool:
            return os.path.isfile("/etc/nv_tegra_release") \
                or os.path.exists("/sys/class/tegra-firmware")

        if cuda_is_jetson():
            logger.debug("Confirmed CUDA platform is available on Jetson.")
            is_cuda = True
        else:
            logger.debug("CUDA platform is not available because: %s", str(e))

    return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None

hpu_platform_plugin

hpu_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def hpu_platform_plugin() -> Optional[str]:
    is_hpu = False
    logger.debug("Checking if HPU platform is available.")
    try:
        from importlib import util
        is_hpu = util.find_spec('habana_frameworks') is not None
        if is_hpu:
            logger.debug("Confirmed HPU platform is available.")
        else:
            logger.debug("HPU platform is not available because "
                         "habana_frameworks is not found.")
    except Exception as e:
        logger.debug("HPU platform is not available because: %s", str(e))

    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None

neuron_platform_plugin

neuron_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def neuron_platform_plugin() -> Optional[str]:
    tnx_installed = False
    nxd_installed = False
    logger.debug("Checking if Neuron platform is available.")
    try:
        import transformers_neuronx  # noqa: F401
        tnx_installed = True
        logger.debug("Confirmed Neuron platform is available because"
                     " transformers_neuronx is found.")
    except ImportError:
        pass

    try:
        import neuronx_distributed_inference  # noqa: F401
        nxd_installed = True
        logger.debug("Confirmed Neuron platform is available because"
                     " neuronx_distributed_inference is found.")
    except ImportError:
        pass

    is_neuron = tnx_installed or nxd_installed
    return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None

resolve_current_platform_cls_qualname

resolve_current_platform_cls_qualname() -> str
Source code in vllm/platforms/__init__.py
def resolve_current_platform_cls_qualname() -> str:
    platform_plugins = load_plugins_by_group('vllm.platform_plugins')

    activated_plugins = []

    for name, func in chain(builtin_platform_plugins.items(),
                            platform_plugins.items()):
        try:
            assert callable(func)
            platform_cls_qualname = func()
            if platform_cls_qualname is not None:
                activated_plugins.append(name)
        except Exception:
            pass

    activated_builtin_plugins = list(
        set(activated_plugins) & set(builtin_platform_plugins.keys()))
    activated_oot_plugins = list(
        set(activated_plugins) & set(platform_plugins.keys()))

    if len(activated_oot_plugins) >= 2:
        raise RuntimeError(
            "Only one platform plugin can be activated, but got: "
            f"{activated_oot_plugins}")
    elif len(activated_oot_plugins) == 1:
        platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
        logger.info("Platform plugin %s is activated",
                    activated_oot_plugins[0])
    elif len(activated_builtin_plugins) >= 2:
        raise RuntimeError(
            "Only one platform plugin can be activated, but got: "
            f"{activated_builtin_plugins}")
    elif len(activated_builtin_plugins) == 1:
        platform_cls_qualname = builtin_platform_plugins[
            activated_builtin_plugins[0]]()
        logger.info("Automatically detected platform %s.",
                    activated_builtin_plugins[0])
    else:
        platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
        logger.info(
            "No platform detected, vLLM is running on UnspecifiedPlatform")
    return platform_cls_qualname
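
Out-of-tree platforms take part in this resolution by exposing a callable in the vllm.platform_plugins entry-point group that returns the platform class qualname, or None when the hardware is absent, mirroring the builtin plugins above. A sketch with purely illustrative package and class names:

# my_platform_plugin.py in a hypothetical out-of-tree package.
#
# The package exposes this function through the "vllm.platform_plugins"
# entry-point group, e.g. in pyproject.toml:
#   [project.entry-points."vllm.platform_plugins"]
#   my_accel = "my_platform_plugin:my_accel_platform_plugin"
from typing import Optional


def my_accel_platform_plugin() -> Optional[str]:
    """Return the platform class qualname if the hardware is present, else None."""
    try:
        import my_accel_runtime  # noqa: F401  (hypothetical device runtime)
    except ImportError:
        return None
    return "my_platform_plugin.platform.MyAcceleratorPlatform"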

rocm_platform_plugin

rocm_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def rocm_platform_plugin() -> Optional[str]:
    is_rocm = False
    logger.debug("Checking if ROCm platform is available.")
    try:
        import amdsmi
        amdsmi.amdsmi_init()
        try:
            if len(amdsmi.amdsmi_get_processor_handles()) > 0:
                is_rocm = True
                logger.debug("Confirmed ROCm platform is available.")
            else:
                logger.debug("ROCm platform is not available because"
                             " no GPU is found.")
        finally:
            amdsmi.amdsmi_shut_down()
    except Exception as e:
        logger.debug("ROCm platform is not available because: %s", str(e))

    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None

tpu_platform_plugin

tpu_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def tpu_platform_plugin() -> Optional[str]:
    is_tpu = False
    logger.debug("Checking if TPU platform is available.")
    try:
        # While it's technically possible to install libtpu on a
        # non-TPU machine, this is a very uncommon scenario. Therefore,
        # we assume that libtpu is installed if and only if the machine
        # has TPUs.
        import libtpu  # noqa: F401
        is_tpu = True
        logger.debug("Confirmed TPU platform is available.")
    except Exception as e:
        logger.debug("TPU platform is not available because: %s", str(e))

    return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None

vllm_version_matches_substr

vllm_version_matches_substr(substr: str) -> bool

Check to see if the vLLM version matches a substring.

Source code in vllm/platforms/__init__.py
def vllm_version_matches_substr(substr: str) -> bool:
    """
    Check to see if the vLLM version matches a substring.
    """
    from importlib.metadata import PackageNotFoundError, version
    try:
        vllm_version = version("vllm")
    except PackageNotFoundError as e:
        logger.warning(
            "The vLLM package was not found, so its version could not be "
            "inspected. This may cause platform detection to fail.")
        raise e
    return substr in vllm_version

xpu_platform_plugin

xpu_platform_plugin() -> Optional[str]
Source code in vllm/platforms/__init__.py
def xpu_platform_plugin() -> Optional[str]:
    is_xpu = False
    logger.debug("Checking if XPU platform is available.")
    try:
        # installed IPEX if the machine has XPUs.
        import intel_extension_for_pytorch  # noqa: F401
        import oneccl_bindings_for_pytorch  # noqa: F401
        import torch
        if hasattr(torch, 'xpu') and torch.xpu.is_available():
            is_xpu = True
            logger.debug("Confirmed XPU platform is available.")
    except Exception as e:
        logger.debug("XPU platform is not available because: %s", str(e))

    return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None