`vllm.utils.jit_monitor` ¶

Monitor unexpected kernel JIT compilation during inference.

After server warmup completes, any kernel JIT compilation or autotuning event indicates a cache miss or unexpected input shape that causes a latency spike. This module registers hooks in supported runtimes to detect such events so they can be investigated.

Set --jit-monitor-mode=error to fail fast on unexpected runtime compilation. Set --jit-monitor-verbose to log every JIT compile with additional runtime details. Verbose logging is intentionally opt-in because it can emit many logs and add overhead.

Currently monitors:

- CuTeDSL `cute.compile` calls
- Triton `@triton.autotune` cache misses (via `knobs.autotuning.print`)
- Triton `@triton.jit` first-time compilations (via `knobs.runtime.jit_post_compile_hook`)
- TileLang `@tilelang.jit` first-time compilations

Functions:

- `activate` – Enable JIT compilation monitoring after warmup.
- `is_active` – Return whether the JIT compilation monitor is currently active.
- `numba_workqueue_threading_layer` – Force numba's fork-safe workqueue threading layer for this block.

`_setup_cutedsl_jit_hook()` ¶

Wrap cutlass.cute.compile to warn on compilation.

Source code in vllm/utils/jit_monitor.py

def _setup_cutedsl_jit_hook() -> None:
    """Install a logging wrapper around ``cutlass.cute.compile``.

    Does nothing when the wrapper is already installed or when
    ``cutlass.cute`` cannot be imported. Once installed, every call to
    ``cute.compile`` is reported through ``_log_cutedsl_jit_compile`` and
    then forwarded, arguments untouched, to the original function.
    """
    global _cutedsl_hook_installed
    if _cutedsl_hook_installed:
        return

    try:
        import cutlass.cute as cute
    except Exception:
        logger.debug("CuTeDSL is not available; skipping CuTeDSL JIT monitor.")
        return

    wrapped_compile = cute.compile

    def _describe(target) -> str:
        # Prefer the object's own ``__name__``; otherwise fall back to its
        # class name, or a placeholder when no kernel object was passed.
        name = getattr(target, "__name__", None)
        if name is not None:
            return name
        if target is None:
            return "<unknown>"
        return target.__class__.__name__

    @functools.wraps(wrapped_compile)
    def _compile_with_monitor(*args, **kwargs):
        # The kernel is taken from the first positional argument, or from
        # the ``function`` keyword when called without positionals.
        target = args[0] if args else kwargs.get("function")
        _log_cutedsl_jit_compile(_describe(target))
        return wrapped_compile(*args, **kwargs)

    cute.compile = _compile_with_monitor
    _cutedsl_hook_installed = True

`_setup_tilelang_jit_hook()` ¶

Wrap TileLang JIT entry points to warn on compilation.

Source code in vllm/utils/jit_monitor.py

def _setup_tilelang_jit_hook() -> None:
    """Install logging wrappers on TileLang's JIT entry points.

    ``JITKernel.__init__`` (from ``tilelang.jit.kernel``) is always wrapped.
    ``JITImpl.__call__`` (from ``tilelang.jit``) is wrapped too when that
    class can be found. The function is a no-op when the hooks are already
    installed, or when TileLang or its ``JITKernel`` class is unavailable.
    """
    global _tilelang_hook_installed
    if _tilelang_hook_installed:
        return

    try:
        kernel_module = importlib.import_module("tilelang.jit.kernel")
    except Exception:
        logger.debug("TileLang is not available; skipping TileLang JIT monitor.")
        return

    kernel_cls = getattr(kernel_module, "JITKernel", None)
    if kernel_cls is None:
        logger.debug(
            "TileLang JITKernel is unavailable; skipping TileLang JIT monitor."
        )
        return

    # JITImpl is optional: a failed import just leaves ``__call__`` unwrapped.
    try:
        jit_module = importlib.import_module("tilelang.jit")
    except Exception:
        jit_module = None
    impl_cls = getattr(jit_module, "JITImpl", None)
    kernel_init = kernel_cls.__init__

    @functools.wraps(kernel_init)
    def _init_with_monitor(self, *args, **kwargs):
        # NOTE(review): positional index 7 is assumed to match the
        # ``from_database`` parameter of JITKernel.__init__ -- confirm
        # against the installed TileLang version.
        loaded_from_db = bool(
            _tilelang_arg(args, kwargs, 7, "from_database", False)
        )
        # A non-zero depth means a wrapped JITImpl.__call__ is in progress;
        # that wrapper emits its own log entry, so stay quiet here.
        should_report = not loaded_from_db and _tilelang_jitimpl_compile_depth == 0
        if should_report:
            kernel_func = _tilelang_arg(args, kwargs, 0, "func")
            _log_tilelang_jit_compile(_tilelang_kernel_name(kernel_func))
        return kernel_init(self, *args, **kwargs)

    kernel_cls.__init__ = _init_with_monitor

    if impl_cls is not None:
        impl_call = impl_cls.__call__

        @functools.wraps(impl_call)
        def _call_with_monitor(self, *args, **kwargs):
            global _tilelang_jitimpl_compile_depth
            miss_key = _tilelang_cache_miss_key(self, args, kwargs)
            if miss_key is None:
                # No miss key: forward the call without logging.
                return impl_call(self, *args, **kwargs)

            _tilelang_jitimpl_compile_depth += 1
            try:
                # Verbose details are only built on request.
                extra = (
                    _format_verbose_tilelang_compile_details(
                        self, args, kwargs, miss_key
                    )
                    if _verbose
                    else None
                )
                wrapped = getattr(self, "func", None)
                # Name the kernel after ``orig_func`` when it is present.
                target = getattr(wrapped, "orig_func", None) or wrapped
                _log_tilelang_jit_compile(_tilelang_kernel_name(target), extra)
                return impl_call(self, *args, **kwargs)
            finally:
                _tilelang_jitimpl_compile_depth -= 1

        impl_cls.__call__ = _call_with_monitor

    _tilelang_hook_installed = True

`_setup_triton_autotuning_print()` ¶

Enable TRITON_PRINT_AUTOTUNING unless the user opted out.

Source code in vllm/utils/jit_monitor.py

def _setup_triton_autotuning_print() -> None:
    """Turn on Triton's autotuning print knob unless the user disabled it.

    No-op when ``HAS_TRITON`` is false. When the environment variable
    ``TRITON_PRINT_AUTOTUNING`` is set to exactly ``"0"`` the knob is left
    untouched and a debug message is logged instead.
    """
    if not HAS_TRITON:
        return
    from triton import knobs  # type: ignore[import-untyped]

    opted_out = os.environ.get("TRITON_PRINT_AUTOTUNING") == "0"
    if not opted_out:
        knobs.autotuning.print = True
        return

    logger.debug(
        "TRITON_PRINT_AUTOTUNING=0 set by user; "
        "autotuning messages will stay suppressed."
    )

`_setup_triton_jit_hook()` ¶

Register a jit_post_compile_hook that warns on compilation.

Source code in vllm/utils/jit_monitor.py

def _setup_triton_jit_hook() -> None:
    """Install a Triton ``jit_post_compile_hook`` that logs each compilation.

    No-op when ``HAS_TRITON`` is false. Any hook that was already registered
    is kept and invoked after the log entry is emitted, and its return value
    is passed through.
    """
    if not HAS_TRITON:
        return
    from triton import knobs  # type: ignore[import-untyped]

    previous_hook = knobs.runtime.jit_post_compile_hook

    def _on_jit_compile(**hook_kwargs):
        # ``jit_post_compile_hook`` is internal Triton API whose keyword
        # set has changed between releases. Taking **kwargs keeps this
        # hook from raising TypeError on an upstream signature change and
        # lets the complete keyword set be handed on, unmodified, to a
        # previously registered hook.
        compiled_fn = hook_kwargs.get("fn")
        _log_triton_jit_compile(
            getattr(compiled_fn, "name", "<unknown>"), hook_kwargs
        )
        if previous_hook is None:
            return None
        return previous_hook(**hook_kwargs)

    knobs.runtime.jit_post_compile_hook = _on_jit_compile

`activate(*, mode='warn', verbose=False)` ¶

Enable JIT compilation monitoring after warmup.

Call once per worker process at the end of `compile_or_warm_up_model`. After activation, every monitored kernel compilation or autotuning benchmark that happens during inference will be logged as a warning or raised as an error, depending on `mode`.

Safe to call multiple times; subsequent calls are no-ops.

If the user has explicitly set TRITON_PRINT_AUTOTUNING=0 in their environment, autotuning printing is left disabled; the JIT compilation hook is still registered regardless.

Source code in vllm/utils/jit_monitor.py

def activate(*, mode: JitMonitorMode = "warn", verbose: bool = False) -> None:
    """Switch on JIT compilation monitoring once warmup has finished.

    Intended to be called once per worker process at the end of
    :func:`compile_or_warm_up_model`. From then on, monitored kernel
    compilations and autotuning benchmarks that occur during inference are
    logged as warnings or raised as errors, depending on ``mode``.

    Repeated calls are harmless: once the monitor is active, later calls
    return immediately without touching any state.

    A user-provided ``TRITON_PRINT_AUTOTUNING=0`` is respected, so
    autotuning printing stays off in that case; the JIT compilation hook
    is registered regardless.

    Args:
        mode: Either ``"warn"`` or ``"error"``.
        verbose: Stored in the module-level ``_verbose`` flag.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    global _active, _mode, _verbose
    if _active:
        return

    supported_modes = ("warn", "error")
    if mode not in supported_modes:
        raise ValueError(f"Unsupported JIT monitor mode: {mode!r}")

    _active, _mode, _verbose = True, mode, verbose

    # Installers run in this fixed order.
    for install_hook in (
        _setup_triton_autotuning_print,
        _setup_triton_jit_hook,
        _setup_cutedsl_jit_hook,
        _setup_tilelang_jit_hook,
    ):
        install_hook()

    logger.info(
        "Kernel JIT monitor activated; monitored JIT compilations during "
        "inference will use mode=%s.",
        mode,
    )

`is_active()` ¶

Return whether the JIT compilation monitor is currently active.

Source code in vllm/utils/jit_monitor.py

def is_active() -> bool:
    """Return whether the JIT compilation monitor is currently active.

    Returns:
        The current value of the module-level ``_active`` flag, which
        :func:`activate` sets to ``True``.
    """
    return _active

`numba_workqueue_threading_layer()` ¶

Force numba's fork-safe workqueue threading layer for this block.

GNU OpenMP (numba's default omp threading layer) aborts the process if a forked child re-enters an OpenMP-active runtime. vLLM forks the EngineCore subprocess from a process that may already have launched numba's parallel accelerator, so the first call to any @njit(parallel=True) function must happen under workqueue instead. The threading layer choice is sticky for the life of the process once launched, so restoring the config on exit does not undo the effect.

Source code in vllm/utils/jit_monitor.py

@contextlib.contextmanager
def numba_workqueue_threading_layer() -> Iterator[None]:
    """Run the enclosed block with numba's `workqueue` threading layer forced.

    numba's default `omp` threading layer is backed by GNU OpenMP, which
    aborts the process when a forked child re-enters an OpenMP-active
    runtime. Because vLLM forks the EngineCore subprocess from a process
    that may already have launched numba's parallel accelerator, the first
    call to any `@njit(parallel=True)` function has to happen under the
    fork-safe `workqueue` layer.

    Both the `NUMBA_THREADING_LAYER` environment variable and
    `numba.config.THREADING_LAYER` are overridden for the duration of the
    block and put back on exit. Once launched, the threading layer choice
    is sticky for the life of the process, so restoring the settings does
    not undo the effect.
    """
    import numba

    env_name = "NUMBA_THREADING_LAYER"
    layer = "workqueue"

    # Remember both settings so they can be restored afterwards.
    saved_env = os.environ.get(env_name)
    saved_config = numba.config.THREADING_LAYER

    os.environ[env_name] = layer
    numba.config.THREADING_LAYER = layer
    try:
        yield
    finally:
        if saved_env is not None:
            os.environ[env_name] = saved_env
        else:
            # The variable was unset before entry; unset it again.
            os.environ.pop(env_name, None)
        numba.config.THREADING_LAYER = saved_config

vllm.utils.jit_monitor ¶

_setup_cutedsl_jit_hook() ¶

_setup_tilelang_jit_hook() ¶

_setup_triton_autotuning_print() ¶

_setup_triton_jit_hook() ¶

activate(*, mode='warn', verbose=False) ¶

is_active() ¶

numba_workqueue_threading_layer() ¶

`vllm.utils.jit_monitor` ¶

`_setup_cutedsl_jit_hook()` ¶

`_setup_tilelang_jit_hook()` ¶

`_setup_triton_autotuning_print()` ¶

`_setup_triton_jit_hook()` ¶

`activate(*, mode='warn', verbose=False)` ¶

`is_active()` ¶

`numba_workqueue_threading_layer()` ¶