Skip to content

vllm.multimodal.gpu_ipc_memory

Admission control for frontend GPU-side multimodal work.

When multimodal media is decoded on the GPU in the API-server (frontend) process, the decoded buffers compete for the same device memory that the engine reserves for weights, activations, and the KV cache. To keep the frontend's GPU usage within a sequestered budget (see `MultiModalConfig.mm_ipc_gpu_memory_gb`), decode paths acquire the number of bytes they need from a process-global `MultiModalGPUMemoryPool` before allocating on the device, and release them once the device memory is freed.

The pool is a simple byte-counting semaphore: acquire blocks until enough budget is free, so concurrent requests serialize rather than oversubscribe the GPU. It lives only in the frontend process; the engine carves the matching amount out of its KV-cache budget so the headroom physically exists.

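Classes:

  • MultiModalGPUMemoryLease – A handle for bytes acquired from a MultiModalGPUMemoryPool.
  • MultiModalGPUMemoryPool – Blocking byte-counting semaphore for frontend GPU multimodal memory.

Functions:

  • get_mm_gpu_ipc_pool – Return the process-global pool, or None when gating is disabled.
  • maybe_init_mm_gpu_ipc_pool – Create and install the global pool from the configured GiB budget.
  • set_mm_gpu_ipc_pool – Install the process-global pool (frontend process only).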

MultiModalGPUMemoryLease

A handle for bytes acquired from a `MultiModalGPUMemoryPool`.

Releasing is idempotent and the lease doubles as a context manager so the budget is returned even if the decode raises.

Source code in vllm/multimodal/gpu_ipc_memory.py
class MultiModalGPUMemoryLease:
    """Receipt for bytes reserved from a :class:`MultiModalGPUMemoryPool`.

    ``release`` hands the lease back to the issuing pool. Releasing is
    idempotent, so calling it more than once is harmless. Used as a context
    manager, the lease is released when the ``with`` block exits, whether or
    not the body raised.
    """

    def __init__(self, pool: "MultiModalGPUMemoryPool", lease_id: int, nbytes: int):
        # Keep a reference to the issuing pool so the lease can return its
        # own bytes later without the caller having to pass the pool around.
        self._pool = pool
        self.nbytes = nbytes
        self.lease_id = lease_id

    def release(self) -> None:
        """Give this lease's bytes back to the pool that issued it."""
        issuer = self._pool
        issuer._release(self)

    def __enter__(self) -> "MultiModalGPUMemoryLease":
        """Enter the ``with`` block; the lease itself is the bound value."""
        return self

    def __exit__(self, *exc_info) -> None:
        """Release on exit; returns ``None`` so any exception propagates."""
        self.release()

MultiModalGPUMemoryPool

Blocking byte-counting semaphore for frontend GPU multimodal memory.

Thread-safe in both directions: acquire (blocking) and release are typically called from the renderer's multimodal executor threads.

Methods:

  • acquire

    Reserve nbytes, blocking until that much budget is free.

Source code in vllm/multimodal/gpu_ipc_memory.py
class MultiModalGPUMemoryPool:
    """Byte-counting, blocking semaphore for frontend GPU multimodal memory.

    All mutable state is guarded by a single ``threading.Condition``, so
    ``acquire`` (which may block) and lease release can be called from
    different threads; they are typically called from the renderer's
    multimodal executor threads.
    """

    def __init__(self, total_bytes: int):
        """Create a pool holding ``total_bytes`` of budget, all of it free.

        Raises ``ValueError`` if ``total_bytes`` is zero or negative.
        """
        if total_bytes <= 0:
            raise ValueError(f"total_bytes must be positive, got {total_bytes}")
        self._cond = threading.Condition()
        self._total_bytes = total_bytes
        self._available = total_bytes
        # Ids of leases handed out and not yet returned; releasing a lease
        # whose id is absent is a no-op, which makes double release safe.
        self._outstanding: set[int] = set()
        self._next_lease_id = 0

    @property
    def total_bytes(self) -> int:
        """Fixed capacity of the pool in bytes."""
        return self._total_bytes

    @property
    def available_bytes(self) -> int:
        """Bytes not currently leased, read under the pool's lock."""
        with self._cond:
            free_now = self._available
        return free_now

    def acquire(self, nbytes: int) -> MultiModalGPUMemoryLease:
        """Take ``nbytes`` out of the pool, waiting until they are available.

        Raises ``ValueError`` for a negative request, and for a request larger
        than the pool's total capacity (waiting could never satisfy it).
        """
        if nbytes < 0:
            raise ValueError(f"Cannot acquire negative bytes: {nbytes}")
        if nbytes > self._total_bytes:
            raise ValueError(
                f"Multimodal GPU decode requested {nbytes} bytes, which exceeds "
                f"the total pool size of {self._total_bytes} bytes. Increase "
                f"--mm-ipc-gpu-memory-gb or reduce the multimodal input size."
            )
        with self._cond:
            # Sleep until a release leaves enough free budget; the predicate
            # is re-checked on every wake-up.
            self._cond.wait_for(lambda: self._available >= nbytes)
            self._available -= nbytes
            issued_id = self._next_lease_id
            self._next_lease_id = issued_id + 1
            self._outstanding.add(issued_id)
        return MultiModalGPUMemoryLease(self, issued_id, nbytes)

    def _release(self, lease: MultiModalGPUMemoryLease) -> None:
        """Return ``lease``'s bytes to the pool and wake every waiter.

        A lease whose id is not outstanding (already released) is ignored.
        """
        with self._cond:
            try:
                self._outstanding.remove(lease.lease_id)
            except KeyError:
                # Already released; keep release idempotent.
                return
            self._available += lease.nbytes
            self._cond.notify_all()

acquire(nbytes)

Reserve nbytes, blocking until that much budget is free.

Raises ValueError if nbytes exceeds the pool's total capacity, since such a request could never be satisfied.

Source code in vllm/multimodal/gpu_ipc_memory.py
def acquire(self, nbytes: int) -> MultiModalGPUMemoryLease:
    """Take ``nbytes`` out of the pool, waiting until they are available.

    Raises ``ValueError`` for a negative request, and for a request larger
    than the pool's total capacity (waiting could never satisfy it).
    """
    if nbytes < 0:
        raise ValueError(f"Cannot acquire negative bytes: {nbytes}")
    if nbytes > self._total_bytes:
        raise ValueError(
            f"Multimodal GPU decode requested {nbytes} bytes, which exceeds "
            f"the total pool size of {self._total_bytes} bytes. Increase "
            f"--mm-ipc-gpu-memory-gb or reduce the multimodal input size."
        )
    with self._cond:
        # Sleep until a release leaves enough free budget; the predicate is
        # re-checked on every wake-up.
        self._cond.wait_for(lambda: self._available >= nbytes)
        self._available -= nbytes
        issued_id = self._next_lease_id
        self._next_lease_id = issued_id + 1
        self._outstanding.add(issued_id)
    return MultiModalGPUMemoryLease(self, issued_id, nbytes)

get_mm_gpu_ipc_pool()

Return the process-global pool, or None when gating is disabled.

Source code in vllm/multimodal/gpu_ipc_memory.py
def get_mm_gpu_ipc_pool() -> MultiModalGPUMemoryPool | None:
    """Look up the pool currently installed for this process.

    Returns the module-level ``_GLOBAL_POOL`` as-is; ``None`` means no pool
    is installed, i.e. gating is disabled.
    """
    return _GLOBAL_POOL

maybe_init_mm_gpu_ipc_pool(mm_ipc_gpu_memory_gb, api_process_count=1)

Create and install the global pool from the configured GiB budget.

Returns None, uninstalling any previously installed pool so that gating is disabled, when the budget is 0 or negative. When multiple API-server processes share one engine, each process gets an equal slice of the user-provided frontend budget.

Source code in vllm/multimodal/gpu_ipc_memory.py
def maybe_init_mm_gpu_ipc_pool(
    mm_ipc_gpu_memory_gb: float,
    api_process_count: int = 1,
) -> MultiModalGPUMemoryPool | None:
    """Create and install the global pool from the configured GiB budget.

    When multiple API-server processes share one engine, each process gets an
    equal slice (floor division) of the user-provided frontend budget.

    Args:
        mm_ipc_gpu_memory_gb: Total frontend budget in GiB, shared by all API
            processes. A value of 0 or less disables gating.
        api_process_count: Number of API-server processes sharing the budget.

    Returns:
        The newly installed pool, or ``None`` when the budget is 0 or less
        (in which case any previously installed pool is cleared).

    Raises:
        ValueError: If ``api_process_count`` is not positive, or if the
            per-process slice of a positive budget rounds down to zero bytes.
    """
    if mm_ipc_gpu_memory_gb <= 0:
        set_mm_gpu_ipc_pool(None)
        return None
    if api_process_count <= 0:
        raise ValueError(f"api_process_count must be positive, got {api_process_count}")
    total_bytes = int(mm_ipc_gpu_memory_gb * GiB_bytes) // api_process_count
    if total_bytes <= 0:
        # A tiny positive budget can floor to zero once converted to bytes and
        # split across processes. Fail here with a message that names the
        # option to change, rather than letting the pool constructor raise a
        # generic "total_bytes must be positive" error.
        raise ValueError(
            f"mm_ipc_gpu_memory_gb={mm_ipc_gpu_memory_gb} split across "
            f"{api_process_count} API process(es) leaves less than one byte "
            f"per process. Increase --mm-ipc-gpu-memory-gb."
        )
    pool = MultiModalGPUMemoryPool(total_bytes)
    set_mm_gpu_ipc_pool(pool)
    logger.info(
        "Initialized multimodal GPU IPC memory pool with %d bytes for this API "
        "process (%.2f GiB total budget across %d API process(es)).",
        total_bytes,
        mm_ipc_gpu_memory_gb,
        api_process_count,
    )
    return pool

set_mm_gpu_ipc_pool(pool)

Install the process-global pool (frontend process only).

Source code in vllm/multimodal/gpu_ipc_memory.py
def set_mm_gpu_ipc_pool(pool: MultiModalGPUMemoryPool | None) -> None:
    """Replace the process-global pool (frontend process only).

    Rebinds the module-level ``_GLOBAL_POOL`` to ``pool``; passing ``None``
    clears whatever pool was installed before.
    """
    global _GLOBAL_POOL
    _GLOBAL_POOL = pool