Skip to content

vllm.utils.jit_monitor

Monitor unexpected kernel JIT compilation during inference.

After server warmup completes, any kernel JIT compilation or autotuning event indicates a cache miss or unexpected input shape that causes a latency spike. This module registers hooks in supported runtimes to detect such events so they can be investigated.

Set --jit-monitor-mode=error to fail fast on unexpected runtime compilation. Set --jit-monitor-verbose to log every JIT compile with additional runtime details. Verbose logging is intentionally opt-in because it can emit many logs and add overhead.

Currently monitors: - CuTeDSL cute.compile calls - Triton @triton.autotune cache misses (via knobs.autotuning.print) - Triton @triton.jit first-time compilations (via knobs.runtime.jit_post_compile_hook)

Functions:

  • activate

    Enable JIT compilation monitoring after warmup.

  • is_active

    Return whether the JIT compilation monitor is currently active.

_setup_cutedsl_jit_hook()

Wrap cutlass.cute.compile to warn on compilation.

Source code in vllm/utils/jit_monitor.py
def _setup_cutedsl_jit_hook() -> None:
    """Wrap ``cutlass.cute.compile`` to warn on compilation."""
    global _cutedsl_hook_installed
    if _cutedsl_hook_installed:
        return

    try:
        import cutlass.cute as cute
    except Exception:
        logger.debug("CuTeDSL is not available; skipping CuTeDSL JIT monitor.")
        return

    original_compile = cute.compile

    @functools.wraps(original_compile)
    def _compile_with_monitor(*args, **kwargs):
        kernel = args[0] if args else kwargs.get("function")
        kernel_name = getattr(kernel, "__name__", None)
        if kernel_name is None:
            kernel_name = (
                kernel.__class__.__name__ if kernel is not None else "<unknown>"
            )
        _log_cutedsl_jit_compile(kernel_name)
        return original_compile(*args, **kwargs)

    cute.compile = _compile_with_monitor
    _cutedsl_hook_installed = True

_setup_triton_autotuning_print()

Enable TRITON_PRINT_AUTOTUNING unless the user opted out.

Source code in vllm/utils/jit_monitor.py
def _setup_triton_autotuning_print() -> None:
    """Enable ``TRITON_PRINT_AUTOTUNING`` unless the user opted out."""
    if not HAS_TRITON:
        return
    from triton import knobs  # type: ignore[import-untyped]

    user_val = os.environ.get("TRITON_PRINT_AUTOTUNING")
    if user_val == "0":
        logger.debug(
            "TRITON_PRINT_AUTOTUNING=0 set by user; "
            "autotuning messages will stay suppressed."
        )
        return

    knobs.autotuning.print = True

_setup_triton_jit_hook()

Register a jit_post_compile_hook that warns on compilation.

Source code in vllm/utils/jit_monitor.py
def _setup_triton_jit_hook() -> None:
    """Register a ``jit_post_compile_hook`` that warns on compilation."""
    if not HAS_TRITON:
        return
    from triton import knobs  # type: ignore[import-untyped]

    existing_hook = knobs.runtime.jit_post_compile_hook

    def _on_jit_compile(**kwargs):
        # `jit_post_compile_hook` is Triton internal API and its
        # signature has changed across releases (kwargs added/renamed).
        # Accept **kwargs so an upstream change cannot crash this hook
        # with TypeError, and forward the full kwarg set to any
        # pre-existing hook unchanged.
        fn = kwargs.get("fn")
        fn_name = getattr(fn, "name", "<unknown>")
        _log_triton_jit_compile(fn_name, kwargs)
        if existing_hook is not None:
            return existing_hook(**kwargs)
        return None

    knobs.runtime.jit_post_compile_hook = _on_jit_compile

activate(*, mode='warn', verbose=False)

Enable JIT compilation monitoring after warmup.

Call once per worker process at the end of :func:compile_or_warm_up_model. After activation every monitored kernel compilation or autotuning benchmark that happens during inference will be logged as a warning or raised as an error, depending on mode.

Safe to call multiple times; subsequent calls are no-ops.

If the user has explicitly set TRITON_PRINT_AUTOTUNING=0 in their environment, autotuning printing is left disabled; the JIT compilation hook is still registered regardless.

Source code in vllm/utils/jit_monitor.py
def activate(*, mode: JitMonitorMode = "warn", verbose: bool = False) -> None:
    """Enable JIT compilation monitoring after warmup.

    Call once per worker process at the end of
    :func:`compile_or_warm_up_model`. After activation every monitored kernel
    compilation or autotuning benchmark that happens during inference will be
    logged as a warning or raised as an error, depending on ``mode``.

    Safe to call multiple times; subsequent calls are no-ops.

    If the user has explicitly set ``TRITON_PRINT_AUTOTUNING=0`` in
    their environment, autotuning printing is left disabled; the JIT
    compilation hook is still registered regardless.
    """
    global _active, _mode, _verbose
    if _active:
        return
    if mode not in ("warn", "error"):
        raise ValueError(f"Unsupported JIT monitor mode: {mode!r}")
    _active = True
    _mode = mode
    _verbose = verbose

    _setup_triton_autotuning_print()
    _setup_triton_jit_hook()
    _setup_cutedsl_jit_hook()

    logger.info(
        "Kernel JIT monitor activated; monitored JIT compilations during "
        "inference will use mode=%s.",
        mode,
    )

is_active()

Return whether the JIT compilation monitor is currently active.

Source code in vllm/utils/jit_monitor.py
def is_active() -> bool:
    """Return whether the JIT compilation monitor is currently active."""
    return _active