`vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.utils.gather_scatter_helper` ¶

Classes:

CopyBufferAllocator –

Memory pool for tensor buffers to avoid frequent allocation/deallocation.

Functions:

gather_kv_caches –

Gather KV cache data from KV cache storage to destination tensor.
scatter_kv_caches –

Scatter KV cache data from source tensor to KV cache storage.

`CopyBufferAllocator` ¶

Memory pool for tensor buffers to avoid frequent allocation/deallocation.

Methods:

alloc_buffer –

Allocate buffers from the pool.
free_buffer –

Return buffers to the pool.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/hf3fs/utils/gather_scatter_helper.py

class CopyBufferAllocator:
    """Memory pool for tensor buffers to avoid frequent allocation/deallocation."""

    def __init__(
        self, device: torch.device, dtype: torch.dtype, shape: list, max_count: int
    ):
        self._shape = shape
        self._max_count = max_count
        self._device = device
        self._free_buffers = [
            torch.empty(shape, dtype=dtype, device=device) for _ in range(max_count)
        ]
        self._inuse_count = 0

    def alloc_buffer(self, count: int) -> list[torch.Tensor] | None:
        """Allocate buffers from the pool."""
        if count == 0:
            return []

        if self._inuse_count + count <= self._max_count:
            self._inuse_count += count
            result = self._free_buffers[-count:]
            del self._free_buffers[-count:]
            return result
        return None

    def free_buffer(self, buffers: list[torch.Tensor]) -> None:
        """Return buffers to the pool."""
        if not buffers:
            return

        if self._inuse_count >= len(buffers):
            self._inuse_count -= len(buffers)
            self._free_buffers.extend(buffers)
        else:
            raise RuntimeError("Attempted to free more buffers than allocated")

`alloc_buffer(count)` ¶

Allocate buffers from the pool.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/hf3fs/utils/gather_scatter_helper.py

def alloc_buffer(self, count: int) -> list[torch.Tensor] | None:
    """Allocate buffers from the pool."""
    if count == 0:
        return []

    if self._inuse_count + count <= self._max_count:
        self._inuse_count += count
        result = self._free_buffers[-count:]
        del self._free_buffers[-count:]
        return result
    return None

`free_buffer(buffers)` ¶

Return buffers to the pool.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/hf3fs/utils/gather_scatter_helper.py

def free_buffer(self, buffers: list[torch.Tensor]) -> None:
    """Hand previously allocated buffers back to the pool.

    An empty ``buffers`` is a no-op. Otherwise the in-use counter is lowered
    by ``len(buffers)`` and the buffers are appended to the free list.

    Raises:
        RuntimeError: If more buffers are returned than are currently
            counted as in use.
    """
    if not buffers:
        return

    returned = len(buffers)
    if returned > self._inuse_count:
        raise RuntimeError("Attempted to free more buffers than allocated")

    self._inuse_count -= returned
    self._free_buffers.extend(buffers)

`gather_kv_caches(kv_caches_ptrs, total_token_in_kvcache, dst_tensor, token_indices, is_mla=False)` ¶

Gather KV cache data from KV cache storage to destination tensor.

Parameters:

kv_caches_ptrs ¶
(Tensor) –

Tensor of KV cache pointers (one per layer)
total_token_in_kvcache ¶
(int) –

Total number of tokens in KV cache
dst_tensor ¶
(Tensor) –

Destination tensor to store gathered data. MHA format: `[num_layers, 2, num_tokens_in_block, hidden_size]`; MLA format: `[num_layers, num_tokens_in_block, hidden_size]`.
token_indices ¶
(list[int]) –

List of token positions to gather
is_mla ¶
(bool, default: False ) –

Whether using MLA model format

Source code in vllm/distributed/kv_transfer/kv_connector/v1/hf3fs/utils/gather_scatter_helper.py

def gather_kv_caches(
    kv_caches_ptrs: torch.Tensor,
    total_token_in_kvcache: int,
    dst_tensor: torch.Tensor,
    token_indices: list[int],
    is_mla: bool = False,
) -> None:
    """Copy selected tokens out of the KV cache storage into ``dst_tensor``.

    The shape of ``dst_tensor`` is validated against the number of layers
    (first dimension of ``kv_caches_ptrs``) and the number of requested
    tokens, then ``kv_cache_gather_kernel`` is launched with one grid entry
    per (layer, token) pair.

    Args:
        kv_caches_ptrs: Tensor of KV cache pointers (one per layer)
        total_token_in_kvcache: Total number of tokens in KV cache
        dst_tensor: Destination tensor to store gathered data
            - MHA format: [num_layers, 2, num_tokens_in_block, hidden_size]
            - MLA format: [num_layers, num_tokens_in_block, hidden_size]
        token_indices: List of token positions to gather
        is_mla: Whether using MLA model format
    """
    layer_count = kv_caches_ptrs.shape[0]
    tokens_per_block = len(token_indices)
    dst_shape = dst_tensor.shape

    if not is_mla:
        # MHA layout: [num_layers, 2, num_tokens_in_block, hidden_size]
        assert len(dst_shape) == 4, (
            f"MHA dst_tensor should be 4D, got {dst_shape}"
        )
        dst_layers, kv_parts, dst_tokens, hidden_size = dst_shape
        assert dst_layers == layer_count, (
            f"Layer count mismatch: {dst_layers} vs {layer_count}"
        )
        assert kv_parts == 2, (
            f"MHA should have 2 (K,V) components, got {kv_parts}"
        )
        assert dst_tokens == tokens_per_block, (
            f"Token count mismatch: {dst_tokens} vs {tokens_per_block}"
        )
    else:
        # MLA layout: [num_layers, num_tokens_in_block, hidden_size]
        assert len(dst_shape) == 3, (
            f"MLA dst_tensor should be 3D, got {dst_shape}"
        )
        dst_layers, dst_tokens, hidden_size = dst_shape
        assert dst_layers == layer_count, (
            f"Layer count mismatch: {dst_layers} vs {layer_count}"
        )
        assert dst_tokens == tokens_per_block, (
            f"Token count mismatch: {dst_tokens} vs {tokens_per_block}"
        )

    # Build the index tensor on the CPU as int32, then move it to the
    # destination's device with a non-blocking transfer.
    host_indices = torch.tensor(token_indices, dtype=torch.int32, device="cpu")
    token_indices_tensor = host_indices.to(dst_tensor.device, non_blocking=True)

    # One kernel program per (layer, token) pair.
    launch_grid = (layer_count, tokens_per_block)

    kv_cache_gather_kernel[launch_grid](
        kv_caches_ptrs,
        dst_tensor,
        token_indices_tensor,
        tokens_per_block,
        hidden_size,
        total_token_in_kvcache,
        layer_count,
        is_mla,
        BLOCK_SIZE=128,
    )

`scatter_kv_caches(kv_caches_ptrs, total_token_in_kvcache, src_tensor, token_indices, is_mla=False)` ¶

Scatter KV cache data from source tensor to KV cache storage.

Parameters:

kv_caches_ptrs ¶
(Tensor) –

Tensor of KV cache pointers (one per layer)
total_token_in_kvcache ¶
(int) –

Total number of tokens in KV cache
src_tensor ¶
(Tensor) –

Source tensor containing data to scatter. MHA format: `[num_layers, 2, num_tokens_in_block, hidden_size]`; MLA format: `[num_layers, num_tokens_in_block, hidden_size]`.
token_indices ¶
(list[int]) –

List of token positions to update
is_mla ¶
(bool, default: False ) –

Whether using MLA model format

Source code in vllm/distributed/kv_transfer/kv_connector/v1/hf3fs/utils/gather_scatter_helper.py

def scatter_kv_caches(
    kv_caches_ptrs: torch.Tensor,
    total_token_in_kvcache: int,
    src_tensor: torch.Tensor,
    token_indices: list[int],
    is_mla: bool = False,
) -> None:
    """Scatter KV cache data from source tensor to KV cache storage.

    The shape of ``src_tensor`` is validated against the number of layers
    and the number of token indices before ``kv_cache_scatter_kernel`` is
    launched with one grid entry per (layer, token) pair. These are the same
    checks ``gather_kv_caches`` performs on its destination tensor; the grid
    and the size arguments passed to the kernel are derived from
    ``len(kv_caches_ptrs)`` and ``len(token_indices)``, so a mismatched
    ``src_tensor`` must be rejected here rather than reach the kernel.

    Args:
        kv_caches_ptrs: Tensor of KV cache pointers (one per layer)
        total_token_in_kvcache: Total number of tokens in KV cache
        src_tensor: Source tensor containing data to scatter
            - MHA format: [num_layers, 2, num_tokens_in_block, hidden_size]
            - MLA format: [num_layers, num_tokens_in_block, hidden_size]
        token_indices: List of token positions to update
        is_mla: Whether using MLA model format

    Raises:
        AssertionError: If ``src_tensor`` does not have the expected rank,
            layer count, K/V component count (MHA only) or token count.
    """
    num_layers = len(kv_caches_ptrs)
    num_tokens_in_block = len(token_indices)

    if is_mla:
        # MLA: src_tensor is [num_layers, num_tokens_in_block, hidden_size]
        assert len(src_tensor.shape) == 3, (
            f"MLA src_tensor should be 3D, got {src_tensor.shape}"
        )
        assert src_tensor.shape[0] == num_layers, (
            f"Layer count mismatch: {src_tensor.shape[0]} vs {num_layers}"
        )
        assert src_tensor.shape[1] == num_tokens_in_block, (
            f"Token count mismatch: {src_tensor.shape[1]} vs {num_tokens_in_block}"
        )
        hidden_size = src_tensor.shape[2]
    else:
        # MHA: src_tensor is [num_layers, 2, num_tokens_in_block, hidden_size]
        assert len(src_tensor.shape) == 4, (
            f"MHA src_tensor should be 4D, got {src_tensor.shape}"
        )
        assert src_tensor.shape[0] == num_layers, (
            f"Layer count mismatch: {src_tensor.shape[0]} vs {num_layers}"
        )
        assert src_tensor.shape[1] == 2, (
            f"MHA should have 2 (K,V) components, got {src_tensor.shape[1]}"
        )
        assert src_tensor.shape[2] == num_tokens_in_block, (
            f"Token count mismatch: {src_tensor.shape[2]} vs {num_tokens_in_block}"
        )
        hidden_size = src_tensor.shape[3]

    device = src_tensor.device
    # Build the index tensor on the CPU as int32, then move it to the source
    # tensor's device with a non-blocking transfer.
    token_indices_tensor = torch.tensor(
        token_indices, dtype=torch.int32, device="cpu"
    ).to(device, non_blocking=True)

    # One kernel program per (layer, token) pair.
    grid = (num_layers, num_tokens_in_block)
    BLOCK_SIZE = 128

    kv_cache_scatter_kernel[grid](
        kv_caches_ptrs,
        src_tensor,
        token_indices_tensor,
        num_tokens_in_block,
        hidden_size,
        total_token_in_kvcache,
        num_layers,
        is_mla,
        BLOCK_SIZE=BLOCK_SIZE,
    )

`vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.utils.gather_scatter_helper` ¶

`CopyBufferAllocator` ¶

`alloc_buffer(count)` ¶

`free_buffer(buffers)` ¶

`gather_kv_caches(kv_caches_ptrs, total_token_in_kvcache, dst_tensor, token_indices, is_mla=False)` ¶

`kv_caches_ptrs` ¶

`total_token_in_kvcache` ¶

`dst_tensor` ¶

`token_indices` ¶

`is_mla` ¶

`scatter_kv_caches(kv_caches_ptrs, total_token_in_kvcache, src_tensor, token_indices, is_mla=False)` ¶

`kv_caches_ptrs` ¶

`total_token_in_kvcache` ¶

`src_tensor` ¶

`token_indices` ¶

`is_mla` ¶

vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.utils.gather_scatter_helper ¶

CopyBufferAllocator ¶

alloc_buffer(count) ¶

free_buffer(buffers) ¶

gather_kv_caches(kv_caches_ptrs, total_token_in_kvcache, dst_tensor, token_indices, is_mla=False) ¶

kv_caches_ptrs ¶

total_token_in_kvcache ¶

dst_tensor ¶

token_indices ¶

is_mla ¶

scatter_kv_caches(kv_caches_ptrs, total_token_in_kvcache, src_tensor, token_indices, is_mla=False) ¶

kv_caches_ptrs ¶

total_token_in_kvcache ¶

src_tensor ¶

token_indices ¶

is_mla ¶

`vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.utils.gather_scatter_helper` ¶

`CopyBufferAllocator` ¶

`alloc_buffer(count)` ¶

`free_buffer(buffers)` ¶

`gather_kv_caches(kv_caches_ptrs, total_token_in_kvcache, dst_tensor, token_indices, is_mla=False)` ¶

`kv_caches_ptrs` ¶

`total_token_in_kvcache` ¶

`dst_tensor` ¶

`token_indices` ¶

`is_mla` ¶

`scatter_kv_caches(kv_caches_ptrs, total_token_in_kvcache, src_tensor, token_indices, is_mla=False)` ¶

`kv_caches_ptrs` ¶

`total_token_in_kvcache` ¶

`src_tensor` ¶

`token_indices` ¶

`is_mla` ¶