Skip to content

vllm.v1.worker.gpu.lora_utils

LoRA utilities for Model Runner V2 and CUDA graph capture/dispatch.

Functions:

create_lora_capture_hook(lora_config, runner)

Create a hook to set up LoRA state before each cudagraph capture.

Source code in vllm/v1/worker/gpu/lora_utils.py
def create_lora_capture_hook(
    lora_config: "LoRAConfig | None",
    runner: Any,
) -> Callable[[int, int, int], None] | None:
    """Create a hook to set up LoRA state before each cudagraph capture."""
    if lora_config is None:
        return None

    def hook(num_active_loras: int, num_reqs: int, num_tokens: int) -> None:
        num_scheduled = np.full(num_reqs, num_tokens // num_reqs, dtype=np.int32)
        num_scheduled[-1] += num_tokens % num_reqs
        with runner.maybe_select_dummy_loras(
            lora_config, num_scheduled, num_active_loras=num_active_loras
        ):
            pass

    return hook

get_lora_capture_cases(lora_config, compilation_config)

Return num_active_loras values for cudagraph capture.

When cudagraph_specialize_lora=True: 0, then powers of 2 up to max_loras, plus max_loras+1. When False: [0, max_loras+1]. When LoRA is disabled: [0].

Source code in vllm/v1/worker/gpu/lora_utils.py
def get_lora_capture_cases(
    lora_config: "LoRAConfig | None",
    compilation_config: "CompilationConfig",
) -> list[int]:
    """
    List the num_active_loras values to use for cudagraph capture.

    The result always starts with 0.

    - LoRA disabled (``lora_config is None``): ``[0]``.
    - ``cudagraph_specialize_lora`` falsy: ``[0, max_loras + 1]``.
    - ``cudagraph_specialize_lora`` truthy: 0 followed by every positive
      count returned by ``get_captured_lora_counts`` (documented upstream as
      powers of 2 up to max_loras, plus max_loras+1).
    """
    if lora_config is None:
        return [0]

    if not compilation_config.cudagraph_specialize_lora:
        return [0, lora_config.max_loras + 1]

    # Older/alternative configs may not carry this flag, hence the default.
    specialize_active = getattr(lora_config, "specialize_active_lora", False)

    cases = [0]
    # 0 is already present, so only strictly positive counts are appended.
    cases.extend(
        count
        for count in get_captured_lora_counts(
            lora_config.max_loras, specialize_active
        )
        if count > 0
    )
    return cases

get_num_active_loras_for_dispatch(lora_config, lora_state, req_ids, dummy_run)

Compute num_active_loras for cudagraph dispatch.

Source code in vllm/v1/worker/gpu/lora_utils.py
def get_num_active_loras_for_dispatch(
    lora_config: "LoRAConfig | None",
    lora_state: "LoraState",
    req_ids: list[str],
    dummy_run: bool,
) -> int:
    """Return the num_active_loras value used for cudagraph dispatch.

    Args:
        lora_config: LoRA configuration; a falsy value (e.g. ``None``) means
            LoRA is not in use.
        lora_state: Object providing ``get_activate_loras(req_ids)``.
        req_ids: Request ids passed through to ``get_activate_loras``.
        dummy_run: Whether this is a dummy run rather than a real batch.

    Returns:
        ``0`` without a LoRA config, ``max_loras + 1`` for a dummy run, and
        otherwise the number of items ``get_activate_loras`` returns.
    """
    if not lora_config:
        return 0

    if dummy_run:
        # Dummy runs do not consult lora_state; they use max_loras + 1.
        return lora_config.max_loras + 1

    active_loras = lora_state.get_activate_loras(req_ids)
    return len(active_loras)