`vllm.v1.core.block_pool` ¶

Classes:

BlockHashToBlockMap –

Cache of blocks that are used for prefix caching. It caches blocks from hash directly to a block or multiple blocks.
BlockPool –

BlockPool that manages KVCacheBlocks.

`BlockHashToBlockMap` ¶

Cache of blocks that are used for prefix caching. It maps a block hash directly to a block or to multiple blocks (i.e. {block_hash: KVCacheBlocks}):

- Mostly, block_hash maps to a single KVCacheBlock, and KVCacheBlocks is simply a KVCacheBlock.
- Otherwise, KVCacheBlocks is a dict of {block_id: KVCacheBlock}.

A cached block is a full block with a block hash that can be used for prefix caching. A cached block may be in use by running requests, or it may sit in the free_block_queue, where it could potentially be evicted.

NOTE #1: We currently don't de-duplicate the blocks in the cache, meaning that if a block becomes full and is cached, we don't check if there is already an identical block in the cache. This is because we want to make sure the allocated block IDs won't change so that block tables are append-only.

NOTE #2: The union type is introduced in order to reduce GC costs from the inner dict.

Methods:

contain –

Checks whether the key maps to the given block ID.
get_one_block –

Gets any block with the given block hash key.
insert –

Inserts the KVCacheBlock to the cache
pop –

Checks if block_hash exists and pops block_id from the cache.

Source code in vllm/v1/core/block_pool.py

class BlockHashToBlockMap:
    """
    Prefix-cache index from a block hash to the block(s) holding it.

    Values have the union type ``KVCacheBlocks``
    (i.e. {block_hash: KVCacheBlocks}):
    - In the common case a hash maps to exactly one block, and the value
        is that KVCacheBlock itself.
    - When several blocks share a hash, the value is a dict of
        {block_id: KVCacheBlock}.

    Only full blocks that carry a block hash are cached. A cached block
    may be referenced by running requests, or may sit in the
    free_block_queue where it can potentially be evicted.

    NOTE #1: Cached blocks are not de-duplicated: when a block becomes
    full and is cached, no check is made for an identical block already
    in the cache. This keeps allocated block IDs stable so that block
    tables remain append-only.
    NOTE #2: The union value type exists to reduce GC costs from the
    inner dict.
    """

    def __init__(self):
        self._cache: dict[
            BlockHashWithGroupId, KVCacheBlock | dict[int, KVCacheBlock]
        ] = {}

    def get_one_block(self, key: BlockHashWithGroupId) -> KVCacheBlock | None:
        """
        Returns some block stored under ``key``, or None when the key is
        absent. For a multi-block entry, the first dict value is returned.
        """
        entry = self._cache.get(key)
        if entry is None:
            return None
        if isinstance(entry, KVCacheBlock):
            return entry
        if isinstance(entry, dict):
            return next(iter(entry.values()))
        self._unexpected_blocks_type(entry)
        return None

    def contain(self, key: BlockHashWithGroupId, block_id: int) -> bool:
        """
        Tells whether ``key`` is present and maps to the block with
        ``block_id``.
        """
        entry = self._cache.get(key)
        if entry is None:
            return False
        if isinstance(entry, KVCacheBlock):
            return entry.block_id == block_id
        if not isinstance(entry, dict):
            self._unexpected_blocks_type(entry)
            return False
        return block_id in entry

    def insert(self, key: BlockHashWithGroupId, block: KVCacheBlock) -> None:
        """
        Adds ``block`` under ``key``, upgrading a single-block entry to a
        dict when a second block arrives for the same key.
        """
        existing = self._cache.get(key)
        if existing is None:
            # First block for this key: store it directly, without a dict.
            self._cache[key] = block
            return
        if isinstance(existing, KVCacheBlock):
            # Second block for this key: switch to the dict representation
            # holding both the previous and the new block.
            self._cache[key] = {existing.block_id: existing, block.block_id: block}
            return
        if isinstance(existing, dict):
            # Already in dict form: just add the new block.
            existing[block.block_id] = block
            return
        self._unexpected_blocks_type(existing)

    def pop(self, key: BlockHashWithGroupId, block_id: int) -> KVCacheBlock | None:
        """
        Removes the block with ``block_id`` from the entry under ``key``
        and returns it; returns None if the key or the block ID is absent.
        """
        entry = self._cache.pop(key, None)
        if entry is None:
            # Nothing is cached under this hash.
            return None
        # TODO(Jialin): If key is found, block_id should always present
        # in blocks. We currently keep the original behaviour for safety.
        #
        # Will add block_id == blocks.block_id assertion and
        # use del blocks[block_id] instead as followup.
        if isinstance(entry, KVCacheBlock):
            if entry.block_id != block_id:
                # Not the requested block (expected to be rare): restore
                # the entry that was just popped.
                self._cache[key] = entry
                return None
            return entry
        if isinstance(entry, dict):
            # Take the requested block out of the dict; the dict goes back
            # into the cache only while it still holds other blocks.
            popped = entry.pop(block_id, None)
            if entry:
                self._cache[key] = entry
            return popped
        self._unexpected_blocks_type(entry)
        return None

    def __len__(self) -> int:
        # Number of distinct hash keys, not number of blocks.
        return len(self._cache)

    def _unexpected_blocks_type(self, blocks: Any) -> None:
        # Always raises; reached only when a cache value is neither a
        # KVCacheBlock nor a dict.
        raise AssertionError(f"Invalid KV cache block type {type(blocks)}")

`contain(key, block_id)` ¶

Checks whether the key maps to the given block ID.

Source code in vllm/v1/core/block_pool.py

def contain(self, key: BlockHashWithGroupId, block_id: int) -> bool:
    """
    Tells whether ``key`` is present and maps to the block with
    ``block_id``.
    """
    entry = self._cache.get(key)
    if entry is None:
        return False
    if isinstance(entry, KVCacheBlock):
        return entry.block_id == block_id
    if not isinstance(entry, dict):
        self._unexpected_blocks_type(entry)
        return False
    return block_id in entry

`get_one_block(key)` ¶

Gets any block with the given block hash key.

Source code in vllm/v1/core/block_pool.py

def get_one_block(self, key: BlockHashWithGroupId) -> KVCacheBlock | None:
    """
    Returns some block stored under ``key``, or None when the key is
    absent. For a multi-block entry, the first dict value is returned.
    """
    entry = self._cache.get(key)
    if entry is None:
        return None
    if isinstance(entry, KVCacheBlock):
        return entry
    if isinstance(entry, dict):
        return next(iter(entry.values()))
    self._unexpected_blocks_type(entry)
    return None

`insert(key, block)` ¶

Inserts the KVCacheBlock to the cache

Source code in vllm/v1/core/block_pool.py

def insert(self, key: BlockHashWithGroupId, block: KVCacheBlock) -> None:
    """
    Adds ``block`` under ``key``, upgrading a single-block entry to a
    dict when a second block arrives for the same key.
    """
    existing = self._cache.get(key)
    if existing is None:
        # First block for this key: store it directly, without a dict.
        self._cache[key] = block
        return
    if isinstance(existing, KVCacheBlock):
        # Second block for this key: switch to the dict representation
        # holding both the previous and the new block.
        self._cache[key] = {existing.block_id: existing, block.block_id: block}
        return
    if isinstance(existing, dict):
        # Already in dict form: just add the new block.
        existing[block.block_id] = block
        return
    self._unexpected_blocks_type(existing)

`pop(key, block_id)` ¶

Checks if block_hash exists and pop block_id from the cache

Source code in vllm/v1/core/block_pool.py

def pop(self, key: BlockHashWithGroupId, block_id: int) -> KVCacheBlock | None:
    """
    Removes the block with ``block_id`` from the entry under ``key``
    and returns it; returns None if the key or the block ID is absent.
    """
    entry = self._cache.pop(key, None)
    if entry is None:
        # Nothing is cached under this hash.
        return None
    # TODO(Jialin): If key is found, block_id should always present
    # in blocks. We currently keep the original behaviour for safety.
    #
    # Will add block_id == blocks.block_id assertion and
    # use del blocks[block_id] instead as followup.
    if isinstance(entry, KVCacheBlock):
        if entry.block_id != block_id:
            # Not the requested block (expected to be rare): restore
            # the entry that was just popped.
            self._cache[key] = entry
            return None
        return entry
    if isinstance(entry, dict):
        # Take the requested block out of the dict; the dict goes back
        # into the cache only while it still holds other blocks.
        popped = entry.pop(block_id, None)
        if entry:
            self._cache[key] = entry
        return popped
    self._unexpected_blocks_type(entry)
    return None

`BlockPool` ¶

BlockPool that manages KVCacheBlocks. It provides methods to allocate, free and cache the kv cache blocks. The free_block_queue stores the free blocks in eviction order to enable allocation, free, and cache eviction. The cached_block_hash_to_block maps between block hash and cached block to support finding cached blocks by their block hash.

Parameters:

num_gpu_blocks ¶
(int) –

The number of blocks in the pool.
enable_caching ¶
(bool) –

Whether to enable prefix caching.
hash_block_size ¶
(int) –

The block size at which the block hashes are computed. The actual block size usually equals hash_block_size, but in cases where different KV cache groups have different block sizes, the actual block size can be a multiple of hash_block_size.
enable_kv_cache_events ¶
(bool, default: False ) –

Whether to enable kv cache events.
metrics_collector ¶
(KVCacheMetricsCollector | None, default: None ) –

Optional metrics collector for tracking block residency.

Methods:

cache_full_blocks –

Cache a list of full blocks for prefix caching.
cache_partial_block –

Register a partial prefix-cache entry for an existing block.
emit_cached_block_events –

Generate BlockStored events for blocks reused from prefix cache.
evict_blocks –

Evict blocks from the prefix cache by their block IDs.
free_blocks –

Free a list of blocks. The blocks should be ordered by their
get_cached_block –

Get the cached block by the block hash for each group in `kv_cache_group_ids`, or None if cache miss for any group.
get_new_blocks –

Get new blocks from the free block pool.
get_num_free_blocks –

Get the number of free blocks in the pool.
get_usage –

Get the KV cache usage.
move_block_hashes –

Re-point src_block's prefix-cache entries to dst_block.
reset_prefix_cache –

Reset prefix cache. This function may be used in RLHF
take_events –

Atomically takes all events and clears the queue.
touch –

Touch a block increases its reference count by 1, and may remove

Source code in vllm/v1/core/block_pool.py

class BlockPool:
    """BlockPool that manages KVCacheBlocks.
    It provides methods to allocate, free and cache the kv cache blocks. The
    free_block_queue stores the free blocks in eviction order to enable
    allocation, free, and cache eviction. The cached_block_hash_to_block
    maps between block hash and cached block to support finding cached blocks
    by their block hash.

    Args:
        num_gpu_blocks: The number of blocks in the pool.
        enable_caching: Whether to enable prefix caching.
        hash_block_size: The block size of which the block hashes are computed.
            The actual block size usually equals hash_block_size, but in cases
            where different KV cache groups have different block sizes, the
            actual block size can be a multiple of hash_block_size.
        enable_kv_cache_events: Whether to enable kv cache events.
        metrics_collector: Optional metrics collector for tracking block residency.
    """

    def __init__(
        self,
        num_gpu_blocks: int,
        enable_caching: bool,
        hash_block_size: int,
        enable_kv_cache_events: bool = False,
        metrics_collector: KVCacheMetricsCollector | None = None,
    ):
        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
        self.num_gpu_blocks = num_gpu_blocks
        self.enable_caching = enable_caching
        self.hash_block_size = hash_block_size
        # All kv-cache blocks.
        self.blocks: list[KVCacheBlock] = [
            KVCacheBlock(idx) for idx in range(num_gpu_blocks)
        ]
        # Free block queue that constructs and manipulates a doubly linked
        # list of free blocks (including eviction candidates when caching is
        # enabled).
        self.free_block_queue = FreeKVCacheBlockQueue(self.blocks)

        # Cache for block lookup
        self.cached_block_hash_to_block: BlockHashToBlockMap = BlockHashToBlockMap()
        self.cached_block_hashes_by_block: dict[int, set[BlockHashWithGroupId]] = {}

        # To represent a placeholder block with block_id=0.
        # The ref_cnt of null_block is not maintained, needs special care to
        # avoid freeing it.
        self.null_block = self.free_block_queue.popleft()
        self.null_block.is_null = True

        self.enable_kv_cache_events = enable_kv_cache_events
        self.kv_event_queue: list[KVCacheEvent] = []

        self.metrics_collector = metrics_collector

    def get_cached_block(
        self, block_hash: BlockHash, kv_cache_group_ids: list[int]
    ) -> list[KVCacheBlock] | None:
        """Get the cached block by the block hash for each group in
        `kv_cache_group_ids`, or None if cache miss for any group.
        If there are duplicated blocks, we return the first block in the cache.

        Args:
            block_hash: The hash value of the block.
            kv_cache_group_ids: The ids of the KV cache groups.

        Returns:
            The cached blocks if exists, or None.
        """
        cached_blocks = []
        for group_id in kv_cache_group_ids:
            block_hash_with_group_id = make_block_hash_with_group_id(
                block_hash, group_id
            )
            block = self.cached_block_hash_to_block.get_one_block(
                block_hash_with_group_id
            )
            if not block:
                return None
            cached_blocks.append(block)
        return cached_blocks

    def cache_full_blocks(
        self,
        request: Request,
        blocks: list[KVCacheBlock],
        num_cached_blocks: int,
        num_full_blocks: int,
        block_size: int,
        kv_cache_group_id: int,
        block_mask: list[bool] | None = None,
    ) -> None:
        """Cache a list of full blocks for prefix caching.
        This function takes a list of blocks that will have their block hash
        metadata to be updated and cached. Given a request, it updates the
        metadata for each block and caching it in the
        `cached_block_hash_to_block`.
        The block hashes values are computed by the Request object immediately
        when it is created and when new tokens are appended.

        When KV cache events are enabled, one ``BlockStored`` event covering
        the whole ``[num_cached_blocks, num_full_blocks)`` range is appended
        to ``kv_event_queue`` after the blocks have been processed.

        Args:
            request: The request to cache the blocks.
            blocks: All blocks in the request.
            num_cached_blocks: The number of blocks that are already cached.
            num_full_blocks: The number of blocks that are full and should
                be cached after this function.
            block_size: Number of tokens in each block.
            kv_cache_group_id: The id of the KV cache group.
            block_mask: Optional mask aligned with
                ``blocks[num_cached_blocks:num_full_blocks]``. When provided,
                blocks where the mask is False are skipped (treated like null
                blocks). Used by groups whose ``find_longest_cache_hit`` only
                consults a subset of blocks (e.g. SWA tail-window), so blocks
                that can never serve a hit stay out of the prefix-cache hash
                map.
        """
        # Nothing new to cache.
        if num_cached_blocks >= num_full_blocks:
            return
        new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
        assert block_mask is None or len(block_mask) == len(new_full_blocks)
        block_hashes = resolve_block_hashes(
            request.block_hashes, self.hash_block_size, block_size
        )

        # Index i below addresses both new_full_blocks[i] and
        # new_block_hashes[i].
        new_block_hashes = block_hashes[num_cached_blocks:]
        # External hashes are only collected when events are enabled.
        new_hashes: list[ExternalBlockHash] | None = (
            [] if self.enable_kv_cache_events else None
        )
        for i, blk in enumerate(new_full_blocks):
            # Some blocks may be null or masked out when enabling sparse attention
            # like sliding window attention, or Mamba models with prefix-caching
            # in align mode. We skip null blocks here.
            if blk.is_null or (block_mask is not None and not block_mask[i]):
                continue
            block_hash = new_block_hashes[i]
            # Number of tokens in the prefix that ends with this block.
            num_hash_tokens = (num_cached_blocks + i + 1) * block_size

            # Update the block's hash metadata and add it to the cache.
            block_hash_with_group_id = make_block_hash_with_group_id(
                block_hash, kv_cache_group_id
            )
            if blk.block_hash is not None:
                # The only valid case where a "new full block" already has a
                # hash is partial->full promotion of the same cache block.
                assert (
                    blk.block_hash_num_tokens is not None
                    and blk.block_hash_num_tokens < num_hash_tokens
                )
                # Drop every hash key still pointing at the block before the
                # full-block hash is inserted.
                removed_hashes = self._remove_cached_block_hashes(blk)
                self._emit_block_removed_events(removed_hashes)
            self._insert_block_hash(
                block_hash_with_group_id,
                blk,
                num_tokens=num_hash_tokens,
            )
            if new_hashes is not None:
                new_hashes.append(maybe_convert_block_hash(block_hash))

        # NOTE(review): the event below is appended even when every new block
        # was skipped (new_hashes is empty), and its parent hash and token
        # range are derived from num_cached_blocks/num_full_blocks, so they
        # also span skipped blocks. Confirm that consumers expect this.
        if self.enable_kv_cache_events:
            if num_cached_blocks == 0:
                parent_block_hash: ExternalBlockHash | None = None
            else:
                parent_block_hash = maybe_convert_block_hash(
                    block_hashes[num_cached_blocks - 1]
                )

            # Calculate token range for the blocks being cached
            start_token_idx = num_cached_blocks * block_size
            end_token_idx = num_full_blocks * block_size

            # Generate extra keys for each block individually.
            # Each block may have different extra_keys (e.g., different MM
            # features, or cache_salt only for the first block).
            # Skip null/masked-out blocks to match the length of new_hashes.
            extra_keys_list: list[tuple[Any, ...] | None] = []
            # The index returned by each call is fed into the next call.
            curr_mm_idx = 0
            for i in range(num_cached_blocks, num_full_blocks):
                if blocks[i].is_null:
                    continue
                # block_mask is indexed relative to num_cached_blocks.
                if block_mask is not None and not block_mask[i - num_cached_blocks]:
                    continue
                block_start = i * block_size
                block_end = block_start + block_size
                extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
                    request, block_start, block_end, curr_mm_idx
                )
                extra_keys_list.append(extra_keys)

            self.kv_event_queue.append(
                self._build_block_stored_event(
                    request,
                    block_hashes=new_hashes,
                    parent_block_hash=parent_block_hash,
                    start_token_idx=start_token_idx,
                    end_token_idx=end_token_idx,
                    block_size=block_size,
                    kv_cache_group_id=kv_cache_group_id,
                    extra_keys_list=extra_keys_list,
                )
            )

    def _build_block_stored_event(
        self,
        request: Request,
        block_hashes: list[ExternalBlockHash] | None,
        parent_block_hash: ExternalBlockHash | None,
        start_token_idx: int,
        end_token_idx: int,
        block_size: int,
        kv_cache_group_id: int,
        extra_keys_list: list[tuple[Any, ...] | None],
    ) -> BlockStored:
        """Assemble the ``BlockStored`` KV event describing ``request``.

        Centralising construction here keeps the event shape identical for
        every caller, so downstream consumers see one format.

        Token IDs are taken from ``request.all_token_ids`` over
        ``[start_token_idx, end_token_idx)``. LoRA id and name come from
        ``request.lora_request`` and are None when the request has none. An
        empty ``extra_keys_list`` is reported as None.
        """
        token_ids = request.all_token_ids[start_token_idx:end_token_idx]

        if request.lora_request:
            lora_id = request.lora_request.adapter_id
        else:
            lora_id = None
        if request.lora_request:
            lora_name = request.lora_request.name
        else:
            lora_name = None

        extra_keys = extra_keys_list or None

        return BlockStored(
            block_hashes=block_hashes,
            parent_block_hash=parent_block_hash,
            token_ids=token_ids,
            block_size=block_size,
            lora_id=lora_id,
            medium=MEDIUM_GPU,
            lora_name=lora_name,
            extra_keys=extra_keys,
            group_idx=kv_cache_group_id,
        )

    def emit_cached_block_events(
        self,
        request: Request,
        num_cached_blocks: int,
        block_size: int,
        kv_cache_group_id: int,
    ) -> None:
        """Generate BlockStored events for blocks reused from prefix cache.

        Unlike cache_full_blocks(), this does NOT modify block state —
        the blocks are already cached. It only generates events so that
        external consumers (e.g. gateway) can learn about reused blocks.

        Args:
            request: The request whose prefix cache blocks were reused.
            num_cached_blocks: Number of blocks that were cache hits.
            block_size: Number of tokens per block.
            kv_cache_group_id: The KV cache group ID.
        """
        if not self.enable_kv_cache_events or num_cached_blocks == 0:
            return

        block_hashes = resolve_block_hashes(
            request.block_hashes, self.hash_block_size, block_size
        )

        # Collect external hashes and extra_keys for cached blocks.
        cached_hashes: list[ExternalBlockHash] = []
        extra_keys_list: list[tuple[Any, ...] | None] = []
        curr_mm_idx = 0
        for i in range(num_cached_blocks):
            block_start = i * block_size
            block_end = block_start + block_size
            cached_hashes.append(maybe_convert_block_hash(block_hashes[i]))
            extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
                request, block_start, block_end, curr_mm_idx
            )
            extra_keys_list.append(extra_keys)

        if not cached_hashes:
            return

        # Prefix-cache hits always form a contiguous prefix starting at block 0,
        # so the first (and thus the whole group's) parent block hash is None.
        parent_block_hash: ExternalBlockHash | None = None
        start_token_idx = 0
        end_token_idx = num_cached_blocks * block_size

        logger.debug(
            "EmitCachedBlock event: block_size=%d, "
            "num_cached_blocks=%d, parent_block_hash=%s, "
            "token_ids_len=%d, group_idx=%s",
            block_size,
            num_cached_blocks,
            parent_block_hash,
            len(request.all_token_ids[start_token_idx:end_token_idx]),
            kv_cache_group_id,
        )

        self.kv_event_queue.append(
            self._build_block_stored_event(
                request,
                block_hashes=cached_hashes,
                parent_block_hash=parent_block_hash,
                start_token_idx=start_token_idx,
                end_token_idx=end_token_idx,
                block_size=block_size,
                kv_cache_group_id=kv_cache_group_id,
                extra_keys_list=extra_keys_list,
            )
        )

    def cache_partial_block(
        self,
        request: Request,
        block: KVCacheBlock,
        num_tokens: int,
        kv_cache_group_id: int,
        block_size: int,
    ) -> BlockHashWithGroupId | None:
        """Register a partial prefix-cache entry for an existing block.

        Prefix-cache keys normally identify full cache blocks. A partial entry
        makes an existing cache block reachable from a fine-grained prefix
        boundary inside that block without allocating or copying a new
        ``KVCacheBlock``.

        The partial entry is lookup metadata owned by ``block``. If ``block``
        has no primary hash, the key becomes its primary hash. If the block
        already has a primary hash, the partial entry is tracked in
        ``cached_block_hashes_by_block`` so eviction, reset, and promotion can
        remove every hash key that points to the block.

        Args:
            request: Request whose token IDs and block hashes define the
                partial entry.
            block: Existing cache block to make reachable from the partial
                prefix boundary.
            num_tokens: Prefix length represented by the partial entry. It
                must be a positive multiple of ``self.hash_block_size`` and
                cannot exceed the request's computed block hashes.
            kv_cache_group_id: KV cache group that owns the partial entry.
            block_size: Cache block size for the owning group. The partial
                entry hash itself is always the prefix-chain hash at
                ``num_tokens``; ``block_size`` is used to assert that the
                entry is partial within the owning cache block.

        Returns:
            The hash key with group ID if a partial entry can be registered;
            otherwise ``None`` for null blocks.
        """
        if block.is_null:
            return None

        assert block_size > self.hash_block_size
        assert block_size % self.hash_block_size == 0
        assert num_tokens % block_size != 0
        block_hash = self._get_partial_block_hash(request, num_tokens)
        num_hash_blocks = num_tokens // self.hash_block_size
        block_hash_with_group_id = make_block_hash_with_group_id(
            block_hash, kv_cache_group_id
        )
        already_cached = block.block_hash == block_hash_with_group_id or (
            self.cached_block_hash_to_block.contain(
                block_hash_with_group_id, block.block_id
            )
        )
        if (
            not already_cached
            and block.block_hash is not None
            and block.block_hash_num_tokens is not None
            and block.block_hash_num_tokens < num_hash_blocks * self.hash_block_size
        ):
            removed_hashes = self._remove_cached_block_hashes(block)
            self._emit_block_removed_events(removed_hashes)
        self._insert_block_hash(
            block_hash_with_group_id,
            block,
            num_tokens=num_hash_blocks * self.hash_block_size,
        )
        if self.enable_kv_cache_events and not already_cached:
            parent_hash, block_start = self._get_partial_block_parent_hash_and_start(
                request, num_tokens
            )
            parent_block_hash = (
                maybe_convert_block_hash(parent_hash)
                if parent_hash is not None
                else None
            )
            block_end = num_tokens
            curr_mm_idx = -1 if block_start > 0 else 0
            extra_keys, _ = generate_block_hash_extra_keys(
                request, block_start, block_end, curr_mm_idx
            )
            self.kv_event_queue.append(
                BlockStored(
                    block_hashes=[maybe_convert_block_hash(block_hash)],
                    parent_block_hash=parent_block_hash,
                    token_ids=request.all_token_ids[block_start:block_end],
                    block_size=block_end - block_start,
                    lora_id=request.lora_request.adapter_id
                    if request.lora_request
                    else None,
                    medium=MEDIUM_GPU,
                    lora_name=request.lora_request.name
                    if request.lora_request
                    else None,
                    extra_keys=[extra_keys],
                    group_idx=kv_cache_group_id,
                )
            )
        return block_hash_with_group_id

    def _get_partial_block_hash(
        self,
        request: Request,
        num_tokens: int,
    ) -> BlockHash:
        assert num_tokens % self.hash_block_size == 0
        num_hash_blocks = num_tokens // self.hash_block_size
        assert 0 < num_hash_blocks <= len(request.block_hashes)

        # Each hash_block_size hash chains over its full prefix, so the partial
        # entry for any group block size is the hash at that prefix boundary.
        return request.block_hashes[num_hash_blocks - 1]

    def _get_partial_block_parent_hash_and_start(
        self,
        request: Request,
        num_tokens: int,
    ) -> tuple[BlockHash | None, int]:
        num_hash_blocks = num_tokens // self.hash_block_size
        parent_hash = (
            request.block_hashes[num_hash_blocks - 2] if num_hash_blocks > 1 else None
        )
        block_start = (num_hash_blocks - 1) * self.hash_block_size
        return parent_hash, block_start

    def _remove_cached_block_hashes(
        self,
        block: KVCacheBlock,
    ) -> list[BlockHashWithGroupId]:
        block_hashes: list[BlockHashWithGroupId] = []
        if block.block_hash is not None:
            block_hashes.append(block.block_hash)
        block_hashes.extend(self.cached_block_hashes_by_block.pop(block.block_id, ()))
        if not block_hashes:
            return []

        removed_hashes: list[BlockHashWithGroupId] = []
        for block_hash in block_hashes:
            if (
                self.cached_block_hash_to_block.pop(block_hash, block.block_id)
                is not None
            ):
                removed_hashes.append(block_hash)
        block.reset_hash()
        return removed_hashes

    def _emit_block_removed_events(
        self,
        block_hashes: list[BlockHashWithGroupId],
    ) -> None:
        if not self.enable_kv_cache_events:
            return
        for block_hash in block_hashes:
            self.kv_event_queue.append(
                BlockRemoved(
                    block_hashes=[maybe_convert_block_hash(get_block_hash(block_hash))],
                    medium=MEDIUM_GPU,
                    group_idx=get_group_id(block_hash),
                )
            )

    def _insert_block_hash(
        self,
        block_hash_with_group_id: BlockHashWithGroupId,
        block: KVCacheBlock,
        num_tokens: int | None,
    ) -> None:
        if block.block_hash == block_hash_with_group_id:
            return

        if self.cached_block_hash_to_block.contain(
            block_hash_with_group_id, block.block_id
        ):
            return

        if block.block_hash is None:
            block.set_block_hash(block_hash_with_group_id, num_tokens=num_tokens)
        else:
            self.cached_block_hashes_by_block.setdefault(block.block_id, set()).add(
                block_hash_with_group_id
            )
        self.cached_block_hash_to_block.insert(block_hash_with_group_id, block)

    def move_block_hashes(
        self,
        src_block: KVCacheBlock,
        dst_block: KVCacheBlock,
    ) -> None:
        """Re-point ``src_block``'s prefix-cache entries to ``dst_block``.

        Used when the request owning ``src_block`` keeps writing into it
        : the prefix cache holds a private copy (``dst_block``)
        under the same hashes instead. Entries stay live; no events emitted.
        """
        assert dst_block.block_hash is None
        assert dst_block.block_id not in self.cached_block_hashes_by_block
        num_tokens = src_block.block_hash_num_tokens
        for block_hash in self._remove_cached_block_hashes(src_block):
            # `num_tokens` only applies to the first (primary) insertion.
            self._insert_block_hash(block_hash, dst_block, num_tokens=num_tokens)

    def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
        """Pop ``num_blocks`` blocks off the free queue and mark them in use.

        The prefix-cache lookup table is not consulted for hits here. When
        caching is enabled, each popped block is first handed to
        ``_maybe_evict_cached_block``.

        Args:
            num_blocks: How many blocks to allocate.

        Returns:
            The allocated blocks, each with ``ref_cnt`` raised from 0 to 1.

        Raises:
            ValueError: If the pool has fewer than ``num_blocks`` free blocks.
        """
        if self.get_num_free_blocks() < num_blocks:
            raise ValueError(f"Cannot get {num_blocks} free blocks from the pool")

        allocated: list[KVCacheBlock] = self.free_block_queue.popleft_n(num_blocks)

        # The loop is written twice on purpose so the caching switch is
        # checked once instead of once per block.
        if not self.enable_caching:
            for blk in allocated:
                assert blk.ref_cnt == 0
                blk.ref_cnt += 1
                if self.metrics_collector:
                    self.metrics_collector.on_block_allocated(blk)
            return allocated

        for blk in allocated:
            self._maybe_evict_cached_block(blk)
            assert blk.ref_cnt == 0
            blk.ref_cnt += 1
            if self.metrics_collector:
                self.metrics_collector.on_block_allocated(blk)
        return allocated

    def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
        """
        If a block is cached in `cached_block_hash_to_block`, we reset its hash
        metadata and evict it from the cache.

        Args:
            block: The block to evict.

        Returns:
            True if the block is evicted, False otherwise.
        """
        # Clean up metrics tracking first to prevent leaks
        if self.metrics_collector:
            self.metrics_collector.on_block_evicted(block)

        evicted_hashes = self._remove_cached_block_hashes(block)
        if not evicted_hashes:
            # The block doesn't have hash, eviction is not needed
            return False

        self._emit_block_removed_events(evicted_hashes)
        return True

    def touch(self, blocks: Sequence[KVCacheBlock]) -> None:
        """Add one reference to every block in ``blocks``.

        Used when another request with the same prefix hits these blocks. A
        non-null block whose reference count is still 0 is taken out of the
        free queue before its count is incremented.

        Args:
            blocks: The blocks to touch.
        """
        for blk in blocks:
            # A zero ref_cnt means the block sits in the free list as an
            # eviction candidate; the null block is never removed from it.
            is_eviction_candidate = not (blk.ref_cnt != 0 or blk.is_null)
            if is_eviction_candidate:
                self.free_block_queue.remove(blk)
            blk.ref_cnt += 1
            collector = self.metrics_collector
            if collector:
                collector.on_block_accessed(blk)

    def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
        """Release one reference on each block and requeue the unused ones.

        The blocks should be ordered by eviction priority, with the block to
        be evicted first coming first.

        Args:
            ordered_blocks: Blocks to free, ordered by eviction priority.
        """
        hashed: list[KVCacheBlock] = []
        unhashed: list[KVCacheBlock] = []
        for blk in ordered_blocks:
            blk.ref_cnt -= 1
            # Blocks that are still referenced, and the null block, never go
            # back to the free queue.
            if blk.ref_cnt != 0 or blk.is_null:
                continue
            # A block without a hash can never match in the prefix cache.
            bucket = unhashed if blk.block_hash is None else hashed
            bucket.append(blk)

        # Hash-less blocks are prepended so they get evicted first; hashed
        # blocks are appended behind the existing entries.
        self.free_block_queue.prepend_n(unhashed)
        self.free_block_queue.append_n(hashed)

    def evict_blocks(self, block_ids: set[int]) -> None:
        """Evict blocks from the prefix cache by their block IDs.

        Every ID is resolved to its block and passed to
        ``_maybe_evict_cached_block``, which only has an effect on blocks
        that currently have a cached hash. This method does not change
        reference counts or the free queue, so blocks that are still in use
        stay allocated and merely lose their prefix-cache entries.

        Args:
            block_ids: Set of block IDs to evict from cache.

        Raises:
            AssertionError: If an ID lies outside ``[0, len(self.blocks))``.
        """
        num_blocks = len(self.blocks)
        for block_id in block_ids:
            # The lower bound matters as well: a negative ID would otherwise
            # wrap around through negative indexing and silently evict an
            # unrelated block.
            assert 0 <= block_id < num_blocks, (
                f"Invalid block_id {block_id} (valid range: [0, {num_blocks})). "
                f"This indicates a bug in the KV connector - workers should "
                f"only report block IDs that were allocated by the scheduler."
            )
            self._maybe_evict_cached_block(self.blocks[block_id])

    def reset_prefix_cache(self) -> bool:
        """Wipe the prefix cache, provided no block is still in use.

        This may be used in RLHF flows to invalidate prefix caching after
        the weights are updated, or to reset prefix caching state for
        benchmarking.

        Returns:
            True if the prefix cache was reset, False if blocks other than
            the null block are still allocated and nothing was changed.
        """
        # The null block is always marked as used, so exactly one used block
        # is the expected idle state.
        still_allocated = self.num_gpu_blocks - self.get_num_free_blocks() - 1
        if still_allocated != 0:
            logger.warning(
                "Failed to reset prefix cache because some "
                "blocks (%d) are not freed yet",
                still_allocated,
            )
            return False

        # Start over with an empty hash map so no lookup can hit anymore.
        self.cached_block_hash_to_block = BlockHashToBlockMap()
        self.cached_block_hashes_by_block.clear()

        # Clear the hash stored on every individual block too.
        for pooled_block in self.blocks:
            pooled_block.reset_hash()

        collector = self.metrics_collector
        if collector:
            collector.reset()

        logger.info("Successfully reset prefix cache")

        if self.enable_kv_cache_events:
            self.kv_event_queue.append(AllBlocksCleared())

        return True

    def get_num_free_blocks(self) -> int:
        """Report how many blocks the free block queue currently holds.

        Returns:
            The ``num_free_blocks`` value of ``self.free_block_queue``.
        """
        free_queue = self.free_block_queue
        return free_queue.num_free_blocks

    def get_usage(self) -> float:
        """Get the KV cache usage.

        Returns:
            The KV cache usage (between 0.0 and 1.0), computed over all
            blocks except the null block. ``0.0`` is returned when the pool
            holds no block besides the null block.
        """
        # Subtract 1 to account for the null block.
        usable_blocks = self.num_gpu_blocks - 1
        if not usable_blocks:
            # Return a float here as well so the result type matches the
            # annotation on every path (and the division below is skipped).
            return 0.0
        return 1.0 - (self.get_num_free_blocks() / usable_blocks)

    def take_events(self) -> list[KVCacheEvent]:
        """Hand over all queued KV cache events and leave the queue empty.

        Returns:
            The list that served as the event queue so far, or a new empty
            list when KV cache events are disabled.
        """
        if self.enable_kv_cache_events:
            # Swap in a fresh list rather than copying the queued events.
            drained, self.kv_event_queue = self.kv_event_queue, []
            return drained
        return []

`_build_block_stored_event(request, block_hashes, parent_block_hash, start_token_idx, end_token_idx, block_size, kv_cache_group_id, extra_keys_list)` ¶

Build a BlockStored KV event for request.

Shared by cache_full_blocks (newly cached blocks) and emit_cached_block_events (prefix-cache-reused blocks) so both emit identical event shapes for downstream consumers.

Source code in vllm/v1/core/block_pool.py

def _build_block_stored_event(
    self,
    request: Request,
    block_hashes: list[ExternalBlockHash] | None,
    parent_block_hash: ExternalBlockHash | None,
    start_token_idx: int,
    end_token_idx: int,
    block_size: int,
    kv_cache_group_id: int,
    extra_keys_list: list[tuple[Any, ...] | None],
) -> BlockStored:
    """Assemble the ``BlockStored`` KV event describing blocks of ``request``.

    ``cache_full_blocks`` (newly cached blocks) and
    ``emit_cached_block_events`` (prefix-cache-reused blocks) both go through
    this helper so that downstream consumers see one event shape.

    Args:
        request: Request supplying the token IDs and the optional LoRA info.
        block_hashes: External hashes of the blocks covered by the event.
        parent_block_hash: External hash of the block preceding the covered
            range, or ``None``.
        start_token_idx: Start of the token slice taken from the request.
        end_token_idx: End (exclusive) of that token slice.
        block_size: Block size reported in the event.
        kv_cache_group_id: KV cache group reported as ``group_idx``.
        extra_keys_list: Per-block extra keys; an empty list is reported as
            ``None``.

    Returns:
        The populated ``BlockStored`` event, with ``medium`` set to
        ``MEDIUM_GPU``.
    """
    covered_token_ids = request.all_token_ids[start_token_idx:end_token_idx]

    # Both LoRA fields stay None when the request carries no LoRA request.
    lora = request.lora_request
    if lora:
        lora_id, lora_name = lora.adapter_id, lora.name
    else:
        lora_id = lora_name = None

    return BlockStored(
        block_hashes=block_hashes,
        parent_block_hash=parent_block_hash,
        token_ids=covered_token_ids,
        block_size=block_size,
        lora_id=lora_id,
        medium=MEDIUM_GPU,
        lora_name=lora_name,
        extra_keys=extra_keys_list or None,
        group_idx=kv_cache_group_id,
    )

`_maybe_evict_cached_block(block)` ¶

If a block is cached in cached_block_hash_to_block, we reset its hash metadata and evict it from the cache.

Parameters:

block ¶
(KVCacheBlock) –

The block to evict.

Returns:

bool –

True if the block is evicted, False otherwise.

Source code in vllm/v1/core/block_pool.py

def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
    """
    If a block is cached in `cached_block_hash_to_block`, we reset its hash
    metadata and evict it from the cache.

    Args:
        block: The block to evict.

    Returns:
        True if the block is evicted, False otherwise.
    """
    # Clean up metrics tracking first to prevent leaks
    if self.metrics_collector:
        self.metrics_collector.on_block_evicted(block)

    evicted_hashes = self._remove_cached_block_hashes(block)
    if not evicted_hashes:
        # The block doesn't have hash, eviction is not needed
        return False

    self._emit_block_removed_events(evicted_hashes)
    return True

`cache_full_blocks(request, blocks, num_cached_blocks, num_full_blocks, block_size, kv_cache_group_id, block_mask=None)` ¶

Cache a list of full blocks for prefix caching. This function takes a list of blocks whose block hash metadata is to be updated and cached. Given a request, it updates the metadata of each block and caches the block in cached_block_hash_to_block. The block hash values are computed by the Request object as soon as it is created and whenever new tokens are appended.

Parameters:

request ¶
(Request) –

The request to cache the blocks.
blocks ¶
(list[KVCacheBlock]) –

All blocks in the request.
num_cached_blocks ¶
(int) –

The number of blocks that are already cached.
num_full_blocks ¶
(int) –

The number of blocks that are full and should be cached after this function.
block_size ¶
(int) –

Number of tokens in each block.
kv_cache_group_id ¶
(int) –

The id of the KV cache group.
block_mask ¶
(list[bool] | None, default: None ) –

Optional mask aligned with blocks[num_cached_blocks:num_full_blocks]. When provided, blocks where the mask is False are skipped (treated like null blocks). Used by groups whose find_longest_cache_hit only consults a subset of blocks (e.g. SWA tail-window), so blocks that can never serve a hit stay out of the prefix-cache hash map.

Source code in vllm/v1/core/block_pool.py

def cache_full_blocks(
    self,
    request: Request,
    blocks: list[KVCacheBlock],
    num_cached_blocks: int,
    num_full_blocks: int,
    block_size: int,
    kv_cache_group_id: int,
    block_mask: list[bool] | None = None,
) -> None:
    """Cache a list of full blocks for prefix caching.

    For every block in ``blocks[num_cached_blocks:num_full_blocks]`` that is
    neither null nor masked out, the block is registered in
    `cached_block_hash_to_block` under the matching request block hash
    combined with ``kv_cache_group_id``. The block hash values are computed
    by the Request object when it is created and when new tokens are
    appended.

    When KV cache events are enabled, one ``BlockStored`` event describing
    the newly cached range is queued as well.

    Nothing happens if ``num_cached_blocks >= num_full_blocks``.

    Args:
        request: The request to cache the blocks.
        blocks: All blocks in the request.
        num_cached_blocks: The number of blocks that are already cached.
        num_full_blocks: The number of blocks that are full and should
            be cached after this function.
        block_size: Number of tokens in each block.
        kv_cache_group_id: The id of the KV cache group.
        block_mask: Optional mask aligned with
            ``blocks[num_cached_blocks:num_full_blocks]``. When provided,
            blocks where the mask is False are skipped (treated like null
            blocks). Used by groups whose ``find_longest_cache_hit`` only
            consults a subset of blocks (e.g. SWA tail-window), so blocks
            that can never serve a hit stay out of the prefix-cache hash
            map.
    """
    if num_cached_blocks >= num_full_blocks:
        return
    new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
    assert block_mask is None or len(block_mask) == len(new_full_blocks)
    block_hashes = resolve_block_hashes(
        request.block_hashes, self.hash_block_size, block_size
    )

    # Sliced from the same offset as `new_full_blocks`, so index `i` below
    # addresses the hash that belongs to `new_full_blocks[i]`.
    new_block_hashes = block_hashes[num_cached_blocks:]
    # External hashes are collected only when an event will be emitted.
    new_hashes: list[ExternalBlockHash] | None = (
        [] if self.enable_kv_cache_events else None
    )
    for i, blk in enumerate(new_full_blocks):
        # Some blocks may be null or masked out when enabling sparse attention
        # like sliding window attention, or Mamba models with prefix-caching
        # in align mode. Null and masked-out blocks are skipped here.
        if blk.is_null or (block_mask is not None and not block_mask[i]):
            continue
        block_hash = new_block_hashes[i]
        # Number of request tokens from the start up to the end of this block.
        num_hash_tokens = (num_cached_blocks + i + 1) * block_size

        # Update the block's hash and add the full block to the cache.
        block_hash_with_group_id = make_block_hash_with_group_id(
            block_hash, kv_cache_group_id
        )
        if blk.block_hash is not None:
            # The only valid case where a "new full block" already has a
            # hash is partial->full promotion of the same cache block.
            assert (
                blk.block_hash_num_tokens is not None
                and blk.block_hash_num_tokens < num_hash_tokens
            )
            # Drop the block's existing hashes (and announce their removal)
            # before the full-block hash is inserted below.
            removed_hashes = self._remove_cached_block_hashes(blk)
            self._emit_block_removed_events(removed_hashes)
        self._insert_block_hash(
            block_hash_with_group_id,
            blk,
            num_tokens=num_hash_tokens,
        )
        if new_hashes is not None:
            new_hashes.append(maybe_convert_block_hash(block_hash))

    if self.enable_kv_cache_events:
        # The parent is the hash of the last block that was already cached,
        # or None when caching starts at the first block.
        if num_cached_blocks == 0:
            parent_block_hash: ExternalBlockHash | None = None
        else:
            parent_block_hash = maybe_convert_block_hash(
                block_hashes[num_cached_blocks - 1]
            )

        # Calculate token range for the blocks being cached
        # NOTE(review): this range also spans blocks skipped above, while
        # `new_hashes` and `extra_keys_list` leave them out - confirm that
        # event consumers expect this.
        start_token_idx = num_cached_blocks * block_size
        end_token_idx = num_full_blocks * block_size

        # Generate extra keys for each block individually.
        # Each block may have different extra_keys (e.g., different MM
        # features, or cache_salt only for the first block).
        # Skip null/masked-out blocks to match the length of new_hashes.
        extra_keys_list: list[tuple[Any, ...] | None] = []
        # Carried from one `generate_block_hash_extra_keys` call to the next.
        curr_mm_idx = 0
        for i in range(num_cached_blocks, num_full_blocks):
            if blocks[i].is_null:
                continue
            if block_mask is not None and not block_mask[i - num_cached_blocks]:
                continue
            block_start = i * block_size
            block_end = block_start + block_size
            extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
                request, block_start, block_end, curr_mm_idx
            )
            extra_keys_list.append(extra_keys)

        # NOTE(review): the event is queued even if every block was skipped
        # and `new_hashes` is empty - confirm this is intended.
        self.kv_event_queue.append(
            self._build_block_stored_event(
                request,
                block_hashes=new_hashes,
                parent_block_hash=parent_block_hash,
                start_token_idx=start_token_idx,
                end_token_idx=end_token_idx,
                block_size=block_size,
                kv_cache_group_id=kv_cache_group_id,
                extra_keys_list=extra_keys_list,
            )
        )

`cache_partial_block(request, block, num_tokens, kv_cache_group_id, block_size)` ¶

Register a partial prefix-cache entry for an existing block.

Prefix-cache keys normally identify full cache blocks. A partial entry makes an existing cache block reachable from a fine-grained prefix boundary inside that block without allocating or copying a new KVCacheBlock.

The partial entry is lookup metadata owned by block. If block has no primary hash, the key becomes its primary hash. If the block already has a primary hash, the partial entry is tracked in cached_block_hashes_by_block so eviction, reset, and promotion can remove every hash key that points to the block.

Parameters:

request ¶
(Request) –

Request whose token IDs and block hashes define the partial entry.
block ¶
(KVCacheBlock) –

Existing cache block to make reachable from the partial prefix boundary.
num_tokens ¶
(int) –

Prefix length represented by the partial entry. It must be a positive multiple of self.hash_block_size and cannot exceed the request's computed block hashes.
kv_cache_group_id ¶
(int) –

KV cache group that owns the partial entry.
block_size ¶
(int) –

Cache block size for the owning group. The partial entry hash itself is always the prefix-chain hash at num_tokens; block_size is used to assert that the entry is partial within the owning cache block.

Returns:

BlockHashWithGroupId | None –

The hash key with group ID if a partial entry can be registered; otherwise None for null blocks.

Source code in vllm/v1/core/block_pool.py

def cache_partial_block(
    self,
    request: Request,
    block: KVCacheBlock,
    num_tokens: int,
    kv_cache_group_id: int,
    block_size: int,
) -> BlockHashWithGroupId | None:
    """Register a partial prefix-cache entry for an existing block.

    Prefix-cache keys normally identify full cache blocks. A partial entry
    makes an existing cache block reachable from a fine-grained prefix
    boundary inside that block without allocating or copying a new
    ``KVCacheBlock``.

    The partial entry is lookup metadata owned by ``block``. If ``block``
    has no primary hash, the key becomes its primary hash. If the block
    already has a primary hash, the partial entry is tracked in
    ``cached_block_hashes_by_block`` so eviction, reset, and promotion can
    remove every hash key that points to the block.

    When KV cache events are enabled and the entry was not already cached,
    a ``BlockStored`` event for the partial span is queued.

    Args:
        request: Request whose token IDs and block hashes define the
            partial entry.
        block: Existing cache block to make reachable from the partial
            prefix boundary.
        num_tokens: Prefix length represented by the partial entry. It
            must be a positive multiple of ``self.hash_block_size`` and
            cannot exceed the request's computed block hashes.
        kv_cache_group_id: KV cache group that owns the partial entry.
        block_size: Cache block size for the owning group. The partial
            entry hash itself is always the prefix-chain hash at
            ``num_tokens``; ``block_size`` is used to assert that the
            entry is partial within the owning cache block.

    Returns:
        The hash key with group ID if a partial entry can be registered;
        otherwise ``None`` for null blocks.
    """
    if block.is_null:
        return None

    # A partial entry only makes sense when a cache block spans several hash
    # blocks and `num_tokens` ends strictly inside a cache block.
    assert block_size > self.hash_block_size
    assert block_size % self.hash_block_size == 0
    assert num_tokens % block_size != 0
    block_hash = self._get_partial_block_hash(request, num_tokens)
    # Number of whole hash blocks covered by `num_tokens`.
    num_hash_blocks = num_tokens // self.hash_block_size
    block_hash_with_group_id = make_block_hash_with_group_id(
        block_hash, kv_cache_group_id
    )
    # True when this exact key is already the block's own hash or is already
    # mapped to this block; in that case no event is emitted further down.
    already_cached = block.block_hash == block_hash_with_group_id or (
        self.cached_block_hash_to_block.contain(
            block_hash_with_group_id, block.block_id
        )
    )
    # If the block's current hash was recorded for fewer tokens than this
    # entry covers, drop its existing hashes (announcing the removal) before
    # inserting the new key.
    if (
        not already_cached
        and block.block_hash is not None
        and block.block_hash_num_tokens is not None
        and block.block_hash_num_tokens < num_hash_blocks * self.hash_block_size
    ):
        removed_hashes = self._remove_cached_block_hashes(block)
        self._emit_block_removed_events(removed_hashes)
    self._insert_block_hash(
        block_hash_with_group_id,
        block,
        num_tokens=num_hash_blocks * self.hash_block_size,
    )
    if self.enable_kv_cache_events and not already_cached:
        parent_hash, block_start = self._get_partial_block_parent_hash_and_start(
            request, num_tokens
        )
        parent_block_hash = (
            maybe_convert_block_hash(parent_hash)
            if parent_hash is not None
            else None
        )
        block_end = num_tokens
        # NOTE(review): -1 is presumably a sentinel understood by
        # `generate_block_hash_extra_keys` for spans that do not start at
        # token 0 - confirm against that helper.
        curr_mm_idx = -1 if block_start > 0 else 0
        extra_keys, _ = generate_block_hash_extra_keys(
            request, block_start, block_end, curr_mm_idx
        )
        # The event reports the length of the partial span as its block
        # size, not the group's `block_size`.
        self.kv_event_queue.append(
            BlockStored(
                block_hashes=[maybe_convert_block_hash(block_hash)],
                parent_block_hash=parent_block_hash,
                token_ids=request.all_token_ids[block_start:block_end],
                block_size=block_end - block_start,
                lora_id=request.lora_request.adapter_id
                if request.lora_request
                else None,
                medium=MEDIUM_GPU,
                lora_name=request.lora_request.name
                if request.lora_request
                else None,
                extra_keys=[extra_keys],
                group_idx=kv_cache_group_id,
            )
        )
    return block_hash_with_group_id

`emit_cached_block_events(request, num_cached_blocks, block_size, kv_cache_group_id)` ¶

Generate BlockStored events for blocks reused from prefix cache.

Unlike cache_full_blocks(), this does NOT modify block state — the blocks are already cached. It only generates events so that external consumers (e.g. gateway) can learn about reused blocks.

Parameters:

request ¶
(Request) –

The request whose prefix cache blocks were reused.
num_cached_blocks ¶
(int) –

Number of blocks that were cache hits.
block_size ¶
(int) –

Number of tokens per block.
kv_cache_group_id ¶
(int) –

The KV cache group ID.

Source code in vllm/v1/core/block_pool.py

def emit_cached_block_events(
    self,
    request: Request,
    num_cached_blocks: int,
    block_size: int,
    kv_cache_group_id: int,
) -> None:
    """Queue a ``BlockStored`` event for blocks reused from the prefix cache.

    In contrast to ``cache_full_blocks()``, no block state is modified here:
    the blocks are already cached. The event only lets external consumers
    (e.g. a gateway) learn about the reused blocks. Nothing is queued when
    KV cache events are disabled or ``num_cached_blocks`` is 0.

    Args:
        request: The request whose prefix cache blocks were reused.
        num_cached_blocks: Number of blocks that were cache hits.
        block_size: Number of tokens per block.
        kv_cache_group_id: The KV cache group ID.
    """
    if not (self.enable_kv_cache_events and num_cached_blocks != 0):
        return

    resolved_hashes = resolve_block_hashes(
        request.block_hashes, self.hash_block_size, block_size
    )

    # Gather the external hash and the extra keys of every reused block.
    external_hashes: list[ExternalBlockHash] = []
    per_block_extra_keys: list[tuple[Any, ...] | None] = []
    mm_cursor = 0
    for idx in range(num_cached_blocks):
        first_token = idx * block_size
        last_token = first_token + block_size
        external_hashes.append(maybe_convert_block_hash(resolved_hashes[idx]))
        keys, mm_cursor = generate_block_hash_extra_keys(
            request, first_token, last_token, mm_cursor
        )
        per_block_extra_keys.append(keys)

    if not external_hashes:
        return

    # Prefix-cache hits always form a contiguous prefix starting at block 0,
    # so the first block (and with it the whole event) has no parent hash.
    parent_block_hash: ExternalBlockHash | None = None
    span_start = 0
    span_end = num_cached_blocks * block_size

    logger.debug(
        "EmitCachedBlock event: block_size=%d, "
        "num_cached_blocks=%d, parent_block_hash=%s, "
        "token_ids_len=%d, group_idx=%s",
        block_size,
        num_cached_blocks,
        parent_block_hash,
        len(request.all_token_ids[span_start:span_end]),
        kv_cache_group_id,
    )

    stored_event = self._build_block_stored_event(
        request,
        block_hashes=external_hashes,
        parent_block_hash=parent_block_hash,
        start_token_idx=span_start,
        end_token_idx=span_end,
        block_size=block_size,
        kv_cache_group_id=kv_cache_group_id,
        extra_keys_list=per_block_extra_keys,
    )
    self.kv_event_queue.append(stored_event)

`evict_blocks(block_ids)` ¶

Evict blocks from the prefix cache by their block IDs.

Only blocks that are currently cached (i.e. have a hash) are evicted. Blocks with ref_cnt > 0 are not freed from the block pool; they are only removed from the prefix cache hash table.

Parameters:

block_ids ¶
(set[int]) –

Set of block IDs to evict from cache.

Source code in vllm/v1/core/block_pool.py

def evict_blocks(self, block_ids: set[int]) -> None:
    """Evict blocks from the prefix cache by their block IDs.

    Every ID is resolved to its block and passed to
    ``_maybe_evict_cached_block``, which only has an effect on blocks
    that currently have a cached hash. This method does not change
    reference counts or the free queue, so blocks that are still in use
    stay allocated and merely lose their prefix-cache entries.

    Args:
        block_ids: Set of block IDs to evict from cache.

    Raises:
        AssertionError: If an ID lies outside ``[0, len(self.blocks))``.
    """
    num_blocks = len(self.blocks)
    for block_id in block_ids:
        # The lower bound matters as well: a negative ID would otherwise
        # wrap around through negative indexing and silently evict an
        # unrelated block.
        assert 0 <= block_id < num_blocks, (
            f"Invalid block_id {block_id} (valid range: [0, {num_blocks})). "
            f"This indicates a bug in the KV connector - workers should "
            f"only report block IDs that were allocated by the scheduler."
        )
        self._maybe_evict_cached_block(self.blocks[block_id])

`free_blocks(ordered_blocks)` ¶

Free a list of blocks. The blocks should be ordered by their eviction priority, where the first block will be evicted first.

Parameters:

ordered_blocks ¶
(Iterable[KVCacheBlock]) –

A list of blocks to free ordered by their eviction priority.

Source code in vllm/v1/core/block_pool.py

def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
    """Release one reference on each block and requeue the unused ones.

    The blocks should be ordered by eviction priority, with the block to
    be evicted first coming first.

    Args:
        ordered_blocks: Blocks to free, ordered by eviction priority.
    """
    hashed: list[KVCacheBlock] = []
    unhashed: list[KVCacheBlock] = []
    for blk in ordered_blocks:
        blk.ref_cnt -= 1
        # Blocks that are still referenced, and the null block, never go
        # back to the free queue.
        if blk.ref_cnt != 0 or blk.is_null:
            continue
        # A block without a hash can never match in the prefix cache.
        bucket = unhashed if blk.block_hash is None else hashed
        bucket.append(blk)

    # Hash-less blocks are prepended so they get evicted first; hashed
    # blocks are appended behind the existing entries.
    self.free_block_queue.prepend_n(unhashed)
    self.free_block_queue.append_n(hashed)

`get_cached_block(block_hash, kv_cache_group_ids)` ¶

Get the cached block by the block hash for each group in kv_cache_group_ids, or None if cache miss for any group. If there are duplicated blocks, we return the first block in the cache.

Parameters:

block_hash ¶
(BlockHash) –

The hash value of the block.
kv_cache_group_ids ¶
(list[int]) –

The ids of the KV cache groups.

Returns:

list[KVCacheBlock] | None –

The cached blocks if exists, or None.

Source code in vllm/v1/core/block_pool.py

def get_cached_block(
    self, block_hash: BlockHash, kv_cache_group_ids: list[int]
) -> list[KVCacheBlock] | None:
    """Look up the cached block for ``block_hash`` in every given group.

    The lookup stops at the first group without a cached block. When
    several blocks are cached under one key, whichever block
    ``get_one_block`` returns is used.

    Args:
        block_hash: The hash value of the block.
        kv_cache_group_ids: The ids of the KV cache groups.

    Returns:
        One cached block per group, in the order of ``kv_cache_group_ids``,
        or None if any group has no cached block for the hash.
    """
    hits: list[KVCacheBlock] = []
    for gid in kv_cache_group_ids:
        group_key = make_block_hash_with_group_id(block_hash, gid)
        hit = self.cached_block_hash_to_block.get_one_block(group_key)
        if hit:
            hits.append(hit)
        else:
            # A miss in a single group makes the whole lookup a miss.
            return None
    return hits

`get_new_blocks(num_blocks)` ¶

Get new blocks from the free block pool.

Note that we do not check block cache in this function.

Parameters:

num_blocks ¶
(int) –

The number of blocks to allocate.

Returns:

list[KVCacheBlock] –

A list of newly allocated blocks.

Source code in vllm/v1/core/block_pool.py

def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
    """Pop ``num_blocks`` blocks off the free queue and mark them in use.

    The prefix-cache lookup table is not consulted for hits here. When
    caching is enabled, each popped block is first handed to
    ``_maybe_evict_cached_block``.

    Args:
        num_blocks: How many blocks to allocate.

    Returns:
        The allocated blocks, each with ``ref_cnt`` raised from 0 to 1.

    Raises:
        ValueError: If the pool has fewer than ``num_blocks`` free blocks.
    """
    if self.get_num_free_blocks() < num_blocks:
        raise ValueError(f"Cannot get {num_blocks} free blocks from the pool")

    allocated: list[KVCacheBlock] = self.free_block_queue.popleft_n(num_blocks)

    # The loop is written twice on purpose so the caching switch is
    # checked once instead of once per block.
    if not self.enable_caching:
        for blk in allocated:
            assert blk.ref_cnt == 0
            blk.ref_cnt += 1
            if self.metrics_collector:
                self.metrics_collector.on_block_allocated(blk)
        return allocated

    for blk in allocated:
        self._maybe_evict_cached_block(blk)
        assert blk.ref_cnt == 0
        blk.ref_cnt += 1
        if self.metrics_collector:
            self.metrics_collector.on_block_allocated(blk)
    return allocated

`get_num_free_blocks()` ¶

Get the number of free blocks in the pool.

Returns:

int –

The number of free blocks.

Source code in vllm/v1/core/block_pool.py

def get_num_free_blocks(self) -> int:
    """Report how many blocks the free block queue currently holds.

    Returns:
        The ``num_free_blocks`` value of ``self.free_block_queue``.
    """
    free_queue = self.free_block_queue
    return free_queue.num_free_blocks

`get_usage()` ¶

Get the KV cache usage.

Returns:

float –

The KV cache usage (between 0.0 and 1.0).

Source code in vllm/v1/core/block_pool.py

def get_usage(self) -> float:
    """Get the KV cache usage.

    Returns:
        The KV cache usage (between 0.0 and 1.0), computed over all
        blocks except the null block. ``0.0`` is returned when the pool
        holds no block besides the null block.
    """
    # Subtract 1 to account for the null block.
    usable_blocks = self.num_gpu_blocks - 1
    if not usable_blocks:
        # Return a float here as well so the result type matches the
        # annotation on every path (and the division below is skipped).
        return 0.0
    return 1.0 - (self.get_num_free_blocks() / usable_blocks)

`move_block_hashes(src_block, dst_block)` ¶

Re-point src_block's prefix-cache entries to dst_block.

Used when the request owning src_block keeps writing into it: the prefix cache holds a private copy (dst_block) under the same hashes instead. Entries stay live; no events emitted.

Source code in vllm/v1/core/block_pool.py

def move_block_hashes(
    self,
    src_block: KVCacheBlock,
    dst_block: KVCacheBlock,
) -> None:
    """Transfer all prefix-cache keys of ``src_block`` onto ``dst_block``.

    Intended for the case where the request that owns ``src_block`` keeps
    writing into it, so the prefix cache should instead reference a
    private copy (``dst_block``) under the same hashes. This method does
    not queue any KV cache events itself.

    Args:
        src_block: Block whose cached hashes are removed.
        dst_block: Block that receives those hashes. It must not have a
            hash yet and must not have any extra keys tracked for it.
    """
    # The destination has to start out completely unregistered.
    assert dst_block.block_hash is None
    assert dst_block.block_id not in self.cached_block_hashes_by_block
    # Read the token count before the source's hashes are removed.
    hash_tokens = src_block.block_hash_num_tokens
    moved_hashes = self._remove_cached_block_hashes(src_block)
    for moved_hash in moved_hashes:
        # The token count is only meaningful for the first key, which
        # becomes the destination's own hash.
        self._insert_block_hash(moved_hash, dst_block, num_tokens=hash_tokens)

`reset_prefix_cache()` ¶

Reset the prefix cache. This function may be used in RLHF flows to invalidate prefix caching after the weights are updated, or to reset the prefix caching state for benchmarking.

Returns:

bool –

True if the prefix cache is successfully reset, False otherwise.

Source code in vllm/v1/core/block_pool.py

def reset_prefix_cache(self) -> bool:
    """Clear the prefix cache, provided no block is still in use.

    This may be used in RLHF flows to invalidate the prefix cache after
    the weights are updated, or to reset the prefix caching status for
    benchmarking.

    Returns:
        bool: True if the prefix cache is successfully reset,
        False otherwise (some blocks have not been freed yet).
    """
    # The null block is always marked as used, so leave it out of the count.
    num_unfreed_blocks = self.num_gpu_blocks - self.get_num_free_blocks() - 1
    if num_unfreed_blocks != 0:
        logger.warning(
            "Failed to reset prefix cache because some "
            "blocks (%d) are not freed yet",
            num_unfreed_blocks,
        )
        return False

    # Drop every hash -> block mapping so that no new lookup can hit.
    self.cached_block_hash_to_block = BlockHashToBlockMap()
    self.cached_block_hashes_by_block.clear()

    # Strip the hash from each individual block as well.
    for pool_block in self.blocks:
        pool_block.reset_hash()

    collector = self.metrics_collector
    if collector:
        collector.reset()

    logger.info("Successfully reset prefix cache")

    # Record the wipe in the event queue when KV cache events are enabled.
    if self.enable_kv_cache_events:
        self.kv_event_queue.append(AllBlocksCleared())

    return True

`take_events()` ¶

Atomically takes all events and clears the queue.

Returns:

list[KVCacheEvent] –

A list of KV cache events.

Source code in vllm/v1/core/block_pool.py

def take_events(self) -> list[KVCacheEvent]:
    """Hand over all queued KV cache events and leave the queue empty.

    Returns:
        The list of events queued so far. A fresh empty list is returned
        when KV cache events are disabled, and the queue is then left
        untouched.
    """
    if self.enable_kv_cache_events:
        # Swap in a new list rather than copying; the caller receives the
        # old queue object itself.
        drained, self.kv_event_queue = self.kv_event_queue, []
        return drained
    return []

`touch(blocks)` ¶

Touching a block increases its reference count by 1, and may remove the block from the free queue. This is used when a block is hit by another request with the same prefix.

Parameters:

blocks ¶
(Sequence[KVCacheBlock]) –

A list of blocks to touch.

Source code in vllm/v1/core/block_pool.py

def touch(self, blocks: Sequence[KVCacheBlock]) -> None:
    """Add one reference to every given block.

    Touching a block increases its reference count by 1 and may remove it
    from the free queue. This is used when a block is hit by another
    request with the same prefix.

    Args:
        blocks: A list of blocks to touch.
    """
    free_queue = self.free_block_queue
    collector = self.metrics_collector
    for touched in blocks:
        # A zero ref_cnt means the block sits in the free list (i.e. it is
        # an eviction candidate), so it has to be taken out. The null
        # block is skipped here.
        is_eviction_candidate = touched.ref_cnt == 0 and not touched.is_null
        if is_eviction_candidate:
            free_queue.remove(touched)
        touched.ref_cnt += 1
        if collector:
            collector.on_block_accessed(touched)

vllm.v1.core.block_pool ¶

BlockHashToBlockMap ¶

contain(key, block_id) ¶

get_one_block(key) ¶

insert(key, block) ¶

pop(key, block_id) ¶

BlockPool ¶

num_gpu_blocks ¶

enable_caching ¶

hash_block_size ¶

enable_kv_cache_events ¶

metrics_collector ¶

_build_block_stored_event(request, block_hashes, parent_block_hash, start_token_idx, end_token_idx, block_size, kv_cache_group_id, extra_keys_list) ¶

_maybe_evict_cached_block(block) ¶

block ¶

cache_full_blocks(request, blocks, num_cached_blocks, num_full_blocks, block_size, kv_cache_group_id, block_mask=None) ¶

request ¶

blocks ¶

num_cached_blocks ¶

num_full_blocks ¶

block_size ¶

kv_cache_group_id ¶

block_mask ¶

cache_partial_block(request, block, num_tokens, kv_cache_group_id, block_size) ¶

request ¶

block ¶

num_tokens ¶

kv_cache_group_id ¶

block_size ¶

emit_cached_block_events(request, num_cached_blocks, block_size, kv_cache_group_id) ¶

request ¶

num_cached_blocks ¶

block_size ¶

kv_cache_group_id ¶

evict_blocks(block_ids) ¶

block_ids ¶

free_blocks(ordered_blocks) ¶

ordered_blocks ¶

get_cached_block(block_hash, kv_cache_group_ids) ¶

block_hash ¶

kv_cache_group_ids ¶

get_new_blocks(num_blocks) ¶

num_blocks ¶

get_num_free_blocks() ¶

get_usage() ¶

move_block_hashes(src_block, dst_block) ¶

reset_prefix_cache() ¶

take_events() ¶

touch(blocks) ¶

blocks ¶

`vllm.v1.core.block_pool` ¶

`BlockHashToBlockMap` ¶

`contain(key, block_id)` ¶

`get_one_block(key)` ¶

`insert(key, block)` ¶

`pop(key, block_id)` ¶

`BlockPool` ¶

`num_gpu_blocks` ¶

`enable_caching` ¶

`hash_block_size` ¶

`enable_kv_cache_events` ¶

`metrics_collector` ¶

`_build_block_stored_event(request, block_hashes, parent_block_hash, start_token_idx, end_token_idx, block_size, kv_cache_group_id, extra_keys_list)` ¶

`_maybe_evict_cached_block(block)` ¶

`block` ¶

`cache_full_blocks(request, blocks, num_cached_blocks, num_full_blocks, block_size, kv_cache_group_id, block_mask=None)` ¶

`request` ¶

`blocks` ¶

`num_cached_blocks` ¶

`num_full_blocks` ¶

`block_size` ¶

`kv_cache_group_id` ¶

`block_mask` ¶

`cache_partial_block(request, block, num_tokens, kv_cache_group_id, block_size)` ¶

`request` ¶

`block` ¶

`num_tokens` ¶

`kv_cache_group_id` ¶

`block_size` ¶

`emit_cached_block_events(request, num_cached_blocks, block_size, kv_cache_group_id)` ¶

`request` ¶

`num_cached_blocks` ¶

`block_size` ¶

`kv_cache_group_id` ¶

`evict_blocks(block_ids)` ¶

`block_ids` ¶

`free_blocks(ordered_blocks)` ¶

`ordered_blocks` ¶

`get_cached_block(block_hash, kv_cache_group_ids)` ¶

`block_hash` ¶

`kv_cache_group_ids` ¶

`get_new_blocks(num_blocks)` ¶

`num_blocks` ¶

`get_num_free_blocks()` ¶

`get_usage()` ¶

`move_block_hashes(src_block, dst_block)` ¶

`reset_prefix_cache()` ¶

`take_events()` ¶

`touch(blocks)` ¶

`blocks` ¶