# vllm_gaudi.attention.oot_mla
## HPUMLAAttention
Bases: MLAAttention
Source code in vllm_gaudi/attention/oot_mla.py
### `batch2block_matmul` (instance attribute)
### `block2batch_matmul` (instance attribute)
### `enable_fp8_attn` (instance attribute)
### `fused_scaled_dot_product_attention` (instance attribute)
fused_scaled_dot_product_attention = (
None
if HPUFusedSDPA is None
else ModuleFusedSDPA(HPUFusedSDPA)
)
### `k_cache` (instance attribute)
k_cache = (
VLLMKVCache()
if not enable_fp8_attn
else VLLMFP8KVCache()
)
### `latent_cache_k` (instance attribute)
latent_cache_k = (
VLLMKVCache()
if not enable_fp8_attn
else VLLMFP8KVCache()
)
### `v_cache` (instance attribute)
v_cache = (
VLLMKVCache(is_v_cache=True)
if not enable_fp8_attn
else VLLMFP8KVCache()
)
### `__init__`
Source code in vllm_gaudi/attention/oot_mla.py
### `_v_up_proj`
Source code in vllm_gaudi/attention/oot_mla.py
### `forward_impl`
forward_impl(
q: Tensor,
k_c_normed: Tensor,
k_pe: Tensor,
kv_cache: Tensor,
attn_metadata: HPUUnifiedAttentionMetadata,
output: Tensor | None = None,
output_scale: Tensor | None = None,
output_block_scale: Tensor | None = None,
) -> Tensor
Source code in vllm_gaudi/attention/oot_mla.py
## HPUMultiHeadLatentAttentionWrapper
Bases: MultiHeadLatentAttentionWrapper
Source code in vllm_gaudi/attention/oot_mla.py
### `mla_attn` (instance attribute)
mla_attn = HPUMLAAttention(
num_heads=num_heads,
scale=scale,
qk_nope_head_dim=qk_nope_head_dim,
qk_rope_head_dim=qk_rope_head_dim,
v_head_dim=v_head_dim,
q_lora_rank=q_lora_rank,
kv_lora_rank=kv_lora_rank,
cache_config=cache_config,
quant_config=quant_config,
prefix=layer_name,
kv_b_proj=kv_b_proj,
use_sparse=is_sparse,
indexer=indexer,
)
### `__init__`
__init__(
hidden_size: int,
num_heads: int,
scale: float,
qk_nope_head_dim: int,
qk_rope_head_dim: int,
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
mla_modules,
cache_config=None,
quant_config=None,
prefix: str = "",
) -> None