vllm_gaudi.utils ¶

T `module-attribute` ¶

T = TypeVar('T')

U `module-attribute` ¶

U = TypeVar('U')

_prompt_token_stats_cls `module-attribute` ¶

_prompt_token_stats_cls: Any = PromptTokenStats

_stats_get_by_source_orig `module-attribute` ¶

_stats_get_by_source_orig = get_by_source

HPUCompileConfig ¶

Configuration class, which holds arguments that will be passed to torch compile with HPU backend.

Source code in vllm_gaudi/utils.py

class HPUCompileConfig:
    """
    Configuration class, which holds arguments that will be
    passed to torch compile with HPU backend.
    """

    def __init__(self, fullgraph: Optional[bool] = None, dynamic: Optional[bool] = None):
        """
        Allow to override the environment variables for corner case scenarios
        when single functions are compiled with torch.compile decorator.
        Env variables should not be overwritten when it comes to compilation
        of the whole model.
        """
        self.fullgraph = fullgraph if fullgraph is not None else get_config().fullgraph_compilation
        self.dynamic = dynamic if dynamic is not None else get_config().dynamic_shapes_compilation
        self.regional_compilation = get_config().regional_compilation

    def get_compile_args(self) -> dict[str, Any]:
        """
        Returns a dictionary of compile arguments that can be used
        with torch.compile method or decorator
        """
        if self.dynamic:
            return {"backend": "hpu_backend", "fullgraph": self.fullgraph, "options": {"force_static_compile": True}}
        else:
            return {"backend": "hpu_backend", "fullgraph": self.fullgraph, "dynamic": False}

dynamic `instance-attribute` ¶

dynamic = (
    dynamic
    if dynamic is not None
    else dynamic_shapes_compilation
)

fullgraph `instance-attribute` ¶

fullgraph = (
    fullgraph
    if fullgraph is not None
    else fullgraph_compilation
)

regional_compilation `instance-attribute` ¶

regional_compilation = regional_compilation

init ¶

__init__(
    fullgraph: Optional[bool] = None,
    dynamic: Optional[bool] = None,
)

Allow to override the environment variables for corner case scenarios when single functions are compiled with torch.compile decorator. Env variables should not be overwritten when it comes to compilation of the whole model.

Source code in vllm_gaudi/utils.py

def __init__(self, fullgraph: Optional[bool] = None, dynamic: Optional[bool] = None):
    """
    Allow to override the environment variables for corner case scenarios
    when single functions are compiled with torch.compile decorator.
    Env variables should not be overwritten when it comes to compilation
    of the whole model.
    """
    self.fullgraph = fullgraph if fullgraph is not None else get_config().fullgraph_compilation
    self.dynamic = dynamic if dynamic is not None else get_config().dynamic_shapes_compilation
    self.regional_compilation = get_config().regional_compilation

get_compile_args ¶

get_compile_args() -> dict[str, Any]

Returns a dictionary of compile arguments that can be used with torch.compile method or decorator

Source code in vllm_gaudi/utils.py

def get_compile_args(self) -> dict[str, Any]:
    """
    Returns a dictionary of compile arguments that can be used
    with torch.compile method or decorator
    """
    if self.dynamic:
        return {"backend": "hpu_backend", "fullgraph": self.fullgraph, "options": {"force_static_compile": True}}
    else:
        return {"backend": "hpu_backend", "fullgraph": self.fullgraph, "dynamic": False}

_hpu_get_by_source ¶

_hpu_get_by_source(self, source: str) -> int

Source code in vllm_gaudi/utils.py

def _hpu_get_by_source(self, source: str) -> int:
    return max(0, _stats_get_by_source_orig(self, source))

async_h2d_copy ¶

async_h2d_copy(
    source, dest_tensor=None, dtype=None, device="hpu"
)

Asynchronously transfer data from host to device.

Parameters:

Name	Description	Default
`source`	CPU tensor or raw data to transfer	required
`dest_tensor`	Optional pre-allocated destination tensor	`None`
`dtype`	Required if source is raw data	`None`
`device`	Target device	`'hpu'`

Returns:

Type	Description
	torch.Tensor on target device

Source code in vllm_gaudi/utils.py

def async_h2d_copy(source, dest_tensor=None, dtype=None, device="hpu"):
    """
    Asynchronously transfer data from host to device.

    Args:
        source: CPU tensor or raw data to transfer
        dest_tensor: Optional pre-allocated destination tensor
        dtype: Required if source is raw data
        device: Target device

    Returns:
        torch.Tensor on target device
    """
    if isinstance(source, torch.Tensor):
        if dest_tensor is not None:
            # Copy into pre-allocated destination tensor
            return dest_tensor.copy_(source, non_blocking=True)
        # Create new device tensor and copy
        assert source.device.type == "cpu", "Source tensor must be on CPU for asynchronous transfer"
        target = torch.empty_like(source, device=device)
        return target.copy_(source, non_blocking=True)
    # Create tensor from data and transfer to device
    if dtype is None:
        raise ValueError("dtype must be specified when source is not a tensor")
    cpu_tensor = torch.tensor(source, dtype=dtype, device="cpu")
    return cpu_tensor.to(device, non_blocking=True)

async_h2d_update ¶

async_h2d_update(
    source: Tensor,
    dest: Tensor,
    indices: list[int],
    device="hpu",
)

Asynchronously update specific rows of a device tensor from a CPU tensor.

Parameters:

Name	Type	Description	Default
`source`	`Tensor`	CPU tensor with data to copy	required
`dest`	`Tensor`	Device tensor to update	required
`indices`	`list[int]`	List of row indices in dest to update	required
`device`		Target device	`'hpu'`

Source code in vllm_gaudi/utils.py

def async_h2d_update(source: torch.Tensor, dest: torch.Tensor, indices: list[int], device="hpu"):
    """
    Asynchronously update specific rows of a device tensor from a CPU tensor.

    Args:
        source: CPU tensor with data to copy
        dest: Device tensor to update
        indices: List of row indices in dest to update
        device: Target device
    """
    idx = torch.tensor(indices, dtype=torch.long, device=device)
    vals = source[indices].to(device, non_blocking=True)
    dest.index_copy_(0, idx, vals)

getattr_nested ¶

getattr_nested(obj: Any, name: str, *default: Any) -> Any

Like built-in getattr but supports dot-separated nested attributes.

Examples:

getattr_nested(obj, 'a.b.c') is equivalent to obj.a.b.c getattr_nested(obj, 'a.b', None) returns None when any intermediate or final attribute is missing.

Parameters:

Name	Type	Description	Default
`obj`	`Any`	Root object.	required
`name`	`str`	Dot-separated attribute path.	required
`*default`	`Any`	Optional default returned when the attribute is missing. At most one default value may be provided (same contract as built-in `getattr`).	`()`

Raises:

Type	Description
`TypeError`	If more than one default value is provided.
`AttributeError`	If the attribute is missing and no default was given.

Source code in vllm_gaudi/utils.py

def getattr_nested(obj: Any, name: str, *default: Any) -> Any:
    """Like built-in getattr but supports dot-separated nested attributes.

    Examples:
        getattr_nested(obj, 'a.b.c')  is equivalent to  obj.a.b.c
        getattr_nested(obj, 'a.b', None)  returns None when any
            intermediate or final attribute is missing.

    Args:
        obj: Root object.
        name: Dot-separated attribute path.
        *default: Optional default returned when the attribute is missing.
            At most one default value may be provided (same contract as
            built-in ``getattr``).

    Raises:
        TypeError: If more than one default value is provided.
        AttributeError: If the attribute is missing and no default was given.
    """
    if len(default) > 1:
        raise TypeError(f"getattr_nested expected at most 3 arguments, got {2 + len(default)}")
    parts = name.split(".")
    try:
        for part in parts:
            obj = getattr(obj, part)
        return obj
    except AttributeError:
        if default:
            return default[0]
        raise

has_quant_config ¶

has_quant_config(model_config: ModelConfig) -> bool

Source code in vllm_gaudi/utils.py

def has_quant_config(model_config: ModelConfig) -> bool:
    return model_config.quantization == "inc" or os.getenv("QUANT_CONFIG", None) is not None

hpu_backend_string `cached` ¶

hpu_backend_string()

Source code in vllm_gaudi/utils.py

@cache
def hpu_backend_string():
    backend_string = "hccl" if not is_fake_hpu() else "gloo"
    return backend_string

hpu_device_string `cached` ¶

hpu_device_string()

Source code in vllm_gaudi/utils.py

@cache
def hpu_device_string():
    device_string = "hpu" if not is_fake_hpu() else "cpu"
    return device_string

is_fake_hpu `cached` ¶

is_fake_hpu() -> bool

Source code in vllm_gaudi/utils.py

@cache
def is_fake_hpu() -> bool:
    return os.environ.get("VLLM_USE_FAKE_HPU", "0") != "0"

make_mrope_positions_tensor_with_pad ¶

make_mrope_positions_tensor_with_pad(
    input_positions: list[list[int]],
    input_mrope_positions: list[list[list[int]]],
    max_prompt_len: int,
    pad: int,
) -> list[list[int]]

Source code in vllm_gaudi/utils.py

def make_mrope_positions_tensor_with_pad(input_positions: list[list[int]], input_mrope_positions: list[list[list[int]]],
                                         max_prompt_len: int, pad: int) -> list[list[int]]:
    # If no mrope positions, returns a flatten (seq_len,)
    if all(mrope_position is None for mrope_position in input_mrope_positions):
        return torch_utils.make_tensor_with_pad(input_positions,
                                                max_len=max_prompt_len,
                                                pad=0,
                                                dtype=torch.long,
                                                device="cpu").flatten()
    # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len)
    # we are going to pad each seq_data in the list
    # using either MRope values or regular position
    mrope_input_positions: list[list[int]] = [[] for _ in range(3)]
    for idx in range(3):
        for b_idx, input_mrope_position in enumerate(input_mrope_positions):
            positions = input_mrope_position[idx] if input_mrope_position is not None else input_positions[b_idx]
            padding_size = max_prompt_len - len(positions)
            assert padding_size >= 0
            padded_positions = positions + (max_prompt_len - len(positions)) * [pad]
            mrope_input_positions[idx].extend(padded_positions)
    return torch.tensor(mrope_input_positions, dtype=torch.long, device="cpu")

make_ndarray_with_pad_align ¶

make_ndarray_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: DTypeLike,
    *,
    max_len_align: int = 1024,
) -> NDArray

Make a padded array from 2D inputs. The padding is applied to the end of each inner list until it reaches max_len.

Source code in vllm_gaudi/utils.py

def make_ndarray_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: npt.DTypeLike,
    *,
    max_len_align: int = 1024,
) -> npt.NDArray:
    """
    Make a padded array from 2D inputs.
    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    """
    # Unlike for most functions, map is faster than a genexpr over `len`
    max_len = max(map(len, x), default=0)
    max_len_aligned = math.ceil(max_len / max_len_align) * max_len_align
    padded_x = np.full((len(x), max_len_aligned), pad, dtype=dtype)

    for ind, blocktb in enumerate(x):
        assert len(blocktb) <= max_len_aligned
        padded_x[ind, :len(blocktb)] = blocktb

    return padded_x

make_tensor_with_pad_align ¶

make_tensor_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: dtype,
    *,
    max_len_align: int = 1024,
    device: Optional[Union[str, device]] = None,
    pin_memory: bool = False,
) -> Tensor

Make a padded tensor from 2D inputs. The padding is applied to the end of each inner list until it reaches max_len_aligned, max_len_aligned is max_len rounding to the nearest max_len_align.

Source code in vllm_gaudi/utils.py

def make_tensor_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: torch.dtype,
    *,
    max_len_align: int = 1024,
    device: Optional[Union[str, torch.device]] = None,
    pin_memory: bool = False,
) -> torch.Tensor:
    """
    Make a padded tensor from 2D inputs.
    The padding is applied to the end of each inner list until it reaches
    max_len_aligned, max_len_aligned is max_len rounding to the nearest
    `max_len_align`.
    """
    np_dtype = torch_utils.TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
    padded_x = make_ndarray_with_pad_align(x, pad, np_dtype, max_len_align=max_len_align)

    tensor = torch.from_numpy(padded_x).to(device)
    if pin_memory:
        tensor = tensor.pin_memory()

    return tensor

make_tensor_with_pad_hpu ¶

make_tensor_with_pad_hpu(
    x: list[list[T]],
    pad: T,
    dtype: dtype,
    *,
    max_len: int | None = None,
    device: Optional[Union[str, device]] = None,
    pin_memory: bool = False,
) -> Tensor

Make a padded tensor from 2D inputs.

The padding is applied to the end of each inner list until it reaches max_len.

HPU-compatible replacement for make_tensor_with_pad. Uses pure PyTorch (pad_sequence) instead of NumPy.

Source code in vllm_gaudi/utils.py

def make_tensor_with_pad_hpu(
    x: list[list[T]],
    pad: T,
    dtype: torch.dtype,
    *,
    max_len: int | None = None,
    device: Optional[Union[str, torch.device]] = None,
    pin_memory: bool = False,
) -> torch.Tensor:
    """
    Make a padded tensor from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.

    HPU-compatible replacement for make_tensor_with_pad.
    Uses pure PyTorch (pad_sequence) instead of NumPy.
    """
    if not x:
        return torch.empty((0, 0), dtype=dtype, device=device)

    # 1. Convert python lists to CPU tensors first
    tensors = [torch.tensor(item, dtype=dtype, device="cpu") for item in x]

    # 2. Use pad_sequence (pure torch implementation)
    #    batch_first=True -> (Batch, Seq)
    tensor = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True, padding_value=pad)

    # 3. Handle max_len truncation or specific padding if requested
    if max_len is not None:
        batch_size, seq_len = tensor.shape
        if seq_len < max_len:
            padding_size = max_len - seq_len
            padding = torch.full((batch_size, padding_size), pad, dtype=dtype, device="cpu")
            tensor = torch.cat((tensor, padding), dim=1)
        elif seq_len > max_len:
            tensor = tensor[:, :max_len]

    # 4. Move to target device
    tensor = tensor.to(device)

    # 5. Pin memory if requested
    if pin_memory:
        tensor = tensor.pin_memory()

    return tensor

patch_nixl_utils_for_hpu ¶

patch_nixl_utils_for_hpu()

Patch vllm.distributed.nixl_utils to use nixl._api instead of rixl._api.

Upstream vLLM gates NIXL imports on is_cuda(), falling back to rixl._api for all other platforms. HPU needs nixl._api (same as CUDA), so we monkey-patch the module-level symbols before anything else imports them.

Source code in vllm_gaudi/utils.py

def patch_nixl_utils_for_hpu():
    """Patch vllm.distributed.nixl_utils to use nixl._api instead of rixl._api.

    Upstream vLLM gates NIXL imports on is_cuda(), falling back to rixl._api
    for all other platforms. HPU needs nixl._api (same as CUDA), so we
    monkey-patch the module-level symbols before anything else imports them.
    """
    import logging

    logger = logging.getLogger(__name__)
    try:
        from nixl._api import nixl_agent as _NixlWrapper
        from nixl._api import nixl_agent_config as _nixl_agent_config
    except ImportError:
        return
    try:
        from nixl._bindings import nixlXferTelemetry as _nixlXferTelemetry
    except ImportError:
        _nixlXferTelemetry = None  # type: ignore[assignment]

    import vllm.distributed.nixl_utils as _nixl_mod

    _nixl_mod.NixlWrapper = _NixlWrapper  # type: ignore[attr-defined]
    _nixl_mod.nixl_agent_config = _nixl_agent_config  # type: ignore[attr-defined]
    if _nixlXferTelemetry is not None:
        _nixl_mod.nixlXferTelemetry = _nixlXferTelemetry  # type: ignore[attr-defined]
    logger.info("Patched vllm.distributed.nixl_utils for HPU (nixl._api)")

setattr_nested ¶

setattr_nested(obj: Any, name: str, value: Any) -> None

Like built-in setattr but supports dot-separated nested attributes.

Examples:

setattr_nested(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val

Parameters:

Name	Type	Description	Default
`obj`	`Any`	Root object.	required
`name`	`str`	Dot-separated attribute path. All parts except the last must already exist as attributes.	required
`value`	`Any`	Value to assign to the final attribute.	required

Raises:

Type	Description
`AttributeError`	If any intermediate attribute does not exist.

Source code in vllm_gaudi/utils.py

def setattr_nested(obj: Any, name: str, value: Any) -> None:
    """Like built-in setattr but supports dot-separated nested attributes.

    Examples:
        setattr_nested(obj, 'a.b.c', val)  is equivalent to  obj.a.b.c = val

    Args:
        obj: Root object.
        name: Dot-separated attribute path.  All parts except the last
            must already exist as attributes.
        value: Value to assign to the final attribute.

    Raises:
        AttributeError: If any intermediate attribute does not exist.
    """
    parts = name.split(".")
    for part in parts[:-1]:
        obj = getattr(obj, part)
    setattr(obj, parts[-1], value)

vllm_gaudi.utils ¶

T module-attribute ¶

U module-attribute ¶

_prompt_token_stats_cls module-attribute ¶

_stats_get_by_source_orig module-attribute ¶

HPUCompileConfig ¶

dynamic instance-attribute ¶

fullgraph instance-attribute ¶

regional_compilation instance-attribute ¶

__init__ ¶

get_compile_args ¶

_hpu_get_by_source ¶

async_h2d_copy ¶

async_h2d_update ¶

getattr_nested ¶

has_quant_config ¶

hpu_backend_string cached ¶

hpu_device_string cached ¶

is_fake_hpu cached ¶

make_mrope_positions_tensor_with_pad ¶

make_ndarray_with_pad_align ¶

make_tensor_with_pad_align ¶

make_tensor_with_pad_hpu ¶

patch_nixl_utils_for_hpu ¶

setattr_nested ¶

T `module-attribute` ¶

U `module-attribute` ¶

_prompt_token_stats_cls `module-attribute` ¶

_stats_get_by_source_orig `module-attribute` ¶

dynamic `instance-attribute` ¶

fullgraph `instance-attribute` ¶

regional_compilation `instance-attribute` ¶

init ¶

hpu_backend_string `cached` ¶

hpu_device_string `cached` ¶

is_fake_hpu `cached` ¶