vllm_gaudi.attention.ops.hpu_paged_attn ¶

_PARTITION_SIZE `module-attribute` ¶

# NOTE(review): no code visible in this listing reads this constant — confirm whether it is still used.
_PARTITION_SIZE = 512

HPUPageAttentionInputBuilderBase `dataclass` ¶

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@dataclass
class HPUPageAttentionInputBuilderBase:
    """Empty marker dataclass: declares no fields and no methods of its own."""

__init__ ¶

__init__() -> None

HPUPagedAttention ¶

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

class HPUPagedAttention:
    """Namespace of static/class helpers around the paged KV cache.

    Every method is a ``staticmethod`` or ``classmethod``; the class holds no
    instance state. Cache manipulation is delegated to ``cache_ops`` and
    decoding to ``ops``.
    """

    @staticmethod
    def get_supported_head_sizes() -> list[int]:
        """Return every supported head size: the integers 1 through 256."""
        smallest, largest = 1, 256
        return list(range(smallest, largest + 1))

    @classmethod
    def supports_attn_type(cls, attn_type: str) -> bool:
        """Return True for decoder, encoder, and encoder-only attention types."""
        # Imported inside the method rather than at module scope.
        from vllm.attention.backends.abstract import AttentionType

        accepted_types = (
            AttentionType.DECODER,
            AttentionType.ENCODER,
            AttentionType.ENCODER_ONLY,
        )
        return attn_type in accepted_types

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[int, ...]:
        """Return ``(num_blocks * block_size, num_kv_heads, head_size)``."""
        total_slots = num_blocks * block_size
        return (total_slots, num_kv_heads, head_size)

    @staticmethod
    def split_kv_cache(
        kv_cache: tuple,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return entries 0-3 of ``kv_cache`` as (keys, values, key scales, value scales).

        ``num_kv_heads`` and ``head_size`` are accepted but not read.
        """
        return kv_cache[0], kv_cache[1], kv_cache[2], kv_cache[3]

    @staticmethod
    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor,
                             slot_mapping: torch.Tensor, kv_cache_dtype: str, is_prompt: bool) -> None:
        """Forward all arguments, in order, to ``cache_ops.reshape_and_cache``."""
        cache_ops.reshape_and_cache(
            key,
            value,
            key_cache,
            value_cache,
            slot_mapping,
            kv_cache_dtype,
            is_prompt,
        )

    @staticmethod
    def forward_decode(**kwargs) -> torch.Tensor:
        """Run the decode kernel selected by ``kv_lora_rank``.

        A truthy ``kwargs["kv_lora_rank"]`` selects ``ops.flat_pa_mla``;
        otherwise ``ops.flat_pa`` is used. All keyword arguments are
        forwarded unchanged to the chosen function.
        """
        kernel = ops.flat_pa_mla if kwargs.get("kv_lora_rank") else ops.flat_pa
        return kernel(**kwargs)

    @staticmethod
    def swap_blocks(
        src_kv_cache: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
        dst_kv_cache: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
        src_to_dsts: torch.Tensor,
    ) -> None:
        """Call ``cache_ops.swap_blocks`` for each cache component in turn.

        Entries 0 (keys) and 1 (values) are always swapped. Entries 2 and 3
        (key and value scales) are swapped only when the source entry is not
        ``None``.
        """
        # Keys first, then values.
        for position in (0, 1):
            cache_ops.swap_blocks(src_kv_cache[position], dst_kv_cache[position], src_to_dsts)

        # Read every scale entry before swapping any of them.
        scale_pairs = (
            (src_kv_cache[2], dst_kv_cache[2]),
            (src_kv_cache[3], dst_kv_cache[3]),
        )
        for src_scales, dst_scales in scale_pairs:
            if src_scales is not None:
                cache_ops.swap_blocks(src_scales, dst_scales, src_to_dsts)

    @staticmethod
    def copy_blocks(
        kv_caches: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
        src_to_dsts: torch.Tensor,
    ) -> None:
        """Regroup ``kv_caches`` by component and call ``cache_ops.copy_blocks``.

        Four lists are built (entry 0, 1, 2 and 3 of every tuple in
        ``kv_caches``) and passed, followed by ``src_to_dsts``.
        """

        def column(position: int) -> list:
            return [entry[position] for entry in kv_caches]

        cache_ops.copy_blocks(column(0), column(1), column(2), column(3), src_to_dsts)

copy_blocks `staticmethod` ¶

copy_blocks(
    kv_caches: list[tuple[Tensor, Tensor, Tensor, Tensor]],
    src_to_dsts: Tensor,
) -> None

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def copy_blocks(
    kv_caches: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
    src_to_dsts: torch.Tensor,
) -> None:
    """Regroup ``kv_caches`` by component and call ``cache_ops.copy_blocks``.

    Four lists are built (entry 0, 1, 2 and 3 of every tuple in
    ``kv_caches``) and passed, followed by ``src_to_dsts``.
    """

    def column(position: int) -> list:
        return [entry[position] for entry in kv_caches]

    cache_ops.copy_blocks(column(0), column(1), column(2), column(3), src_to_dsts)

forward_decode `staticmethod` ¶

forward_decode(**kwargs) -> Tensor

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def forward_decode(**kwargs) -> torch.Tensor:
    """Run the decode kernel selected by ``kv_lora_rank``.

    A truthy ``kwargs["kv_lora_rank"]`` selects ``ops.flat_pa_mla``;
    otherwise ``ops.flat_pa`` is used. All keyword arguments are forwarded
    unchanged to the chosen function.
    """
    kernel = ops.flat_pa_mla if kwargs.get("kv_lora_rank") else ops.flat_pa
    return kernel(**kwargs)

get_kv_cache_shape `staticmethod` ¶

get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> tuple[int, ...]

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> tuple[int, ...]:
    """Return ``(num_blocks * block_size, num_kv_heads, head_size)``."""
    total_slots = num_blocks * block_size
    return (total_slots, num_kv_heads, head_size)

get_supported_head_sizes `staticmethod` ¶

get_supported_head_sizes() -> list[int]

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def get_supported_head_sizes() -> list[int]:
    """Return every supported head size: the integers 1 through 256."""
    smallest, largest = 1, 256
    return list(range(smallest, largest + 1))

split_kv_cache `staticmethod` ¶

split_kv_cache(
    kv_cache: tuple, num_kv_heads: int, head_size: int
) -> tuple[Tensor, Tensor, Tensor, Tensor]

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def split_kv_cache(
    kv_cache: tuple,
    num_kv_heads: int,
    head_size: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Return entries 0-3 of ``kv_cache`` as (keys, values, key scales, value scales).

    ``num_kv_heads`` and ``head_size`` are accepted but not read.
    """
    return kv_cache[0], kv_cache[1], kv_cache[2], kv_cache[3]

supports_attn_type `classmethod` ¶

supports_attn_type(attn_type: str) -> bool

HPU attention supports decoder, encoder, and encoder-only attention.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
    """Return True for decoder, encoder, and encoder-only attention types."""
    # Imported inside the method rather than at module scope.
    from vllm.attention.backends.abstract import AttentionType

    accepted_types = (
        AttentionType.DECODER,
        AttentionType.ENCODER,
        AttentionType.ENCODER_ONLY,
    )
    return attn_type in accepted_types

swap_blocks `staticmethod` ¶

swap_blocks(
    src_kv_cache: tuple[Tensor, Tensor, Tensor, Tensor],
    dst_kv_cache: tuple[Tensor, Tensor, Tensor, Tensor],
    src_to_dsts: Tensor,
) -> None

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def swap_blocks(
    src_kv_cache: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
    dst_kv_cache: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
    src_to_dsts: torch.Tensor,
) -> None:
    """Call ``cache_ops.swap_blocks`` for each cache component in turn.

    Entries 0 (keys) and 1 (values) are always swapped. Entries 2 and 3
    (key and value scales) are swapped only when the source entry is not
    ``None``.
    """
    # Keys first, then values.
    for position in (0, 1):
        cache_ops.swap_blocks(src_kv_cache[position], dst_kv_cache[position], src_to_dsts)

    # Read every scale entry before swapping any of them.
    scale_pairs = (
        (src_kv_cache[2], dst_kv_cache[2]),
        (src_kv_cache[3], dst_kv_cache[3]),
    )
    for src_scales, dst_scales in scale_pairs:
        if src_scales is not None:
            cache_ops.swap_blocks(src_scales, dst_scales, src_to_dsts)

write_to_paged_cache `staticmethod` ¶

write_to_paged_cache(
    key: Tensor,
    value: Tensor,
    key_cache: Tensor,
    value_cache: Tensor,
    slot_mapping: Tensor,
    kv_cache_dtype: str,
    is_prompt: bool,
) -> None

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@staticmethod
def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor,
                         slot_mapping: torch.Tensor, kv_cache_dtype: str, is_prompt: bool) -> None:
    """Forward all arguments, in order, to ``cache_ops.reshape_and_cache``."""
    cache_ops.reshape_and_cache(
        key,
        value,
        key_cache,
        value_cache,
        slot_mapping,
        kv_cache_dtype,
        is_prompt,
    )

HPUPagedAttentionMetadata `dataclass` ¶

Metadata for PagedAttention.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@dataclass
class HPUPagedAttentionMetadata:
    """Metadata for PagedAttention.

    Declares five fields, each annotated as an optional tensor. None of them
    has a default, so the generated ``__init__`` requires all five, in the
    order declared below; ``None`` is an accepted value for each.
    """
    # NOTE(review): the meaning, shape and dtype of each tensor are not
    # visible here — confirm against the code that builds and consumes them.
    block_list: Optional[torch.Tensor]
    block_mapping: Optional[torch.Tensor]
    block_usage: Optional[torch.Tensor]
    block_groups: Optional[torch.Tensor]
    alibi_blocks: Optional[torch.Tensor]

alibi_blocks `instance-attribute` ¶

alibi_blocks: Optional[Tensor]

block_groups `instance-attribute` ¶

block_groups: Optional[Tensor]

block_list `instance-attribute` ¶

block_list: Optional[Tensor]

block_mapping `instance-attribute` ¶

block_mapping: Optional[Tensor]

block_usage `instance-attribute` ¶

block_usage: Optional[Tensor]

__init__ ¶

__init__(
    block_list: Optional[Tensor],
    block_mapping: Optional[Tensor],
    block_usage: Optional[Tensor],
    block_groups: Optional[Tensor],
    alibi_blocks: Optional[Tensor],
) -> None

HPUPagedAttentionMetadataBuilder `dataclass` ¶

Bases: AttentionMetadataBuilder

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

@dataclass
class HPUPagedAttentionMetadataBuilder(AttentionMetadataBuilder):
    """Metadata builder that keeps a reference to the input builder it was given.

    ``prepare`` does nothing and ``build`` returns the
    ``HPUPagedAttentionMetadata`` class itself rather than an instance.
    """

    def __init__(self, input_builder: "HPUPageAttentionInputBuilderBase") -> None:
        """Store ``input_builder`` on the instance as ``self.input_builder``."""
        self.input_builder = input_builder

    def prepare(self) -> None:
        """Prepare for one batch; this implementation performs no work."""

    def build(
        self,
        seq_lens: list[int],
        query_lens: list[int],
        cuda_graph_pad_size: int,
        batch_size: int,
    ) -> type[HPUPagedAttentionMetadata]:
        """Return the ``HPUPagedAttentionMetadata`` class (not an instance).

        None of the arguments is read by this implementation.
        """
        return HPUPagedAttentionMetadata

input_builder `instance-attribute` ¶

input_builder = input_builder

__init__ ¶

__init__(
    input_builder: HPUPageAttentionInputBuilderBase,
) -> None

Create the builder, remember some configuration and parameters.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

def __init__(self, input_builder: "HPUPageAttentionInputBuilderBase") -> None:
    """Create the builder and keep a reference to ``input_builder``.

    Args:
        input_builder: Stored unchanged as ``self.input_builder``.
    """
    self.input_builder = input_builder

build ¶

build(
    seq_lens: list[int],
    query_lens: list[int],
    cuda_graph_pad_size: int,
    batch_size: int,
) -> type[HPUPagedAttentionMetadata]

Build attention metadata with on-device tensors.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

def build(
    self,
    seq_lens: list[int],
    query_lens: list[int],
    cuda_graph_pad_size: int,
    batch_size: int,
) -> type[HPUPagedAttentionMetadata]:
    """Return the ``HPUPagedAttentionMetadata`` class (not an instance).

    None of the arguments is read by this implementation.
    """
    return HPUPagedAttentionMetadata

prepare ¶

prepare() -> None

Prepare for one batch.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py

def prepare(self) -> None:
    """Prepare for one batch; this implementation performs no work."""

vllm_gaudi.attention.ops.hpu_paged_attn ¶

_PARTITION_SIZE module-attribute ¶

HPUPageAttentionInputBuilderBase dataclass ¶

__init__ ¶

HPUPagedAttention ¶

copy_blocks staticmethod ¶

forward_decode staticmethod ¶

get_kv_cache_shape staticmethod ¶

get_supported_head_sizes staticmethod ¶

split_kv_cache staticmethod ¶

supports_attn_type classmethod ¶

swap_blocks staticmethod ¶

write_to_paged_cache staticmethod ¶

HPUPagedAttentionMetadata dataclass ¶

alibi_blocks instance-attribute ¶

block_groups instance-attribute ¶

block_list instance-attribute ¶

block_mapping instance-attribute ¶

block_usage instance-attribute ¶

__init__ ¶

HPUPagedAttentionMetadataBuilder dataclass ¶

input_builder instance-attribute ¶

__init__ ¶

build ¶

prepare ¶

_PARTITION_SIZE `module-attribute` ¶

HPUPageAttentionInputBuilderBase `dataclass` ¶

__init__ ¶

copy_blocks `staticmethod` ¶

forward_decode `staticmethod` ¶

get_kv_cache_shape `staticmethod` ¶

get_supported_head_sizes `staticmethod` ¶

split_kv_cache `staticmethod` ¶

supports_attn_type `classmethod` ¶

swap_blocks `staticmethod` ¶

write_to_paged_cache `staticmethod` ¶

HPUPagedAttentionMetadata `dataclass` ¶

alibi_blocks `instance-attribute` ¶

block_groups `instance-attribute` ¶

block_list `instance-attribute` ¶

block_mapping `instance-attribute` ¶

block_usage `instance-attribute` ¶

__init__ ¶

HPUPagedAttentionMetadataBuilder `dataclass` ¶

input_builder `instance-attribute` ¶

__init__ ¶