
vllm.tracing
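
OpenTelemetry-based tracing helpers for vLLM: trace-header propagation, GenAI span attribute names, and tracer/exporter initialization.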

TRACE_HEADERS module-attribute

TRACE_HEADERS = ['traceparent', 'tracestate']
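
These are the two W3C Trace Context propagation headers; a request that participates in a distributed trace carries them. A hypothetical example following the W3C format:

headers = {
    "traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01",
    "tracestate": "vendor1=value1",
}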

_is_otel_imported module-attribute

_is_otel_imported = True

logger module-attribute

logger = init_logger(__name__)

otel_import_error_traceback module-attribute

otel_import_error_traceback: Optional[str] = None

BaseSpanAttributes

Source code in vllm/tracing.py
class BaseSpanAttributes:  # type: ignore
    # Fallback stub defined when the OpenTelemetry import fails
    # (see otel_import_error_traceback).
    pass

SpanAttributes

Source code in vllm/tracing.py
class SpanAttributes:
    # Attribute names copied from here to avoid version conflicts:
    # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
    # Attribute names used until they are added to the semantic conventions:
    GEN_AI_REQUEST_ID = "gen_ai.request.id"
    GEN_AI_REQUEST_N = "gen_ai.request.n"
    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
    # Time taken in the forward pass for this request across all workers
    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
        "gen_ai.latency.time_in_model_forward")
    # Time taken in the model execute function. This includes the model
    # forward pass, blocking/sync across workers, CPU-GPU sync time, and
    # sampling time.
    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
        "gen_ai.latency.time_in_model_execute")

GEN_AI_LATENCY_E2E class-attribute instance-attribute

GEN_AI_LATENCY_E2E = 'gen_ai.latency.e2e'

GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE class-attribute instance-attribute

GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
    "gen_ai.latency.time_in_model_execute"
)

GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD class-attribute instance-attribute

GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
    "gen_ai.latency.time_in_model_forward"
)

GEN_AI_LATENCY_TIME_IN_QUEUE class-attribute instance-attribute

GEN_AI_LATENCY_TIME_IN_QUEUE = (
    "gen_ai.latency.time_in_queue"
)

GEN_AI_LATENCY_TIME_IN_SCHEDULER class-attribute instance-attribute

GEN_AI_LATENCY_TIME_IN_SCHEDULER = (
    "gen_ai.latency.time_in_scheduler"
)

GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN class-attribute instance-attribute

GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = (
    "gen_ai.latency.time_to_first_token"
)

GEN_AI_REQUEST_ID class-attribute instance-attribute

GEN_AI_REQUEST_ID = 'gen_ai.request.id'

GEN_AI_REQUEST_MAX_TOKENS class-attribute instance-attribute

GEN_AI_REQUEST_MAX_TOKENS = 'gen_ai.request.max_tokens'

GEN_AI_REQUEST_N class-attribute instance-attribute

GEN_AI_REQUEST_N = 'gen_ai.request.n'

GEN_AI_REQUEST_TEMPERATURE class-attribute instance-attribute

GEN_AI_REQUEST_TEMPERATURE = 'gen_ai.request.temperature'

GEN_AI_REQUEST_TOP_P class-attribute instance-attribute

GEN_AI_REQUEST_TOP_P = 'gen_ai.request.top_p'

GEN_AI_RESPONSE_MODEL class-attribute instance-attribute

GEN_AI_RESPONSE_MODEL = 'gen_ai.response.model'

GEN_AI_USAGE_COMPLETION_TOKENS class-attribute instance-attribute

GEN_AI_USAGE_COMPLETION_TOKENS = (
    "gen_ai.usage.completion_tokens"
)

GEN_AI_USAGE_NUM_SEQUENCES class-attribute instance-attribute

GEN_AI_USAGE_NUM_SEQUENCES = 'gen_ai.usage.num_sequences'

GEN_AI_USAGE_PROMPT_TOKENS class-attribute instance-attribute

GEN_AI_USAGE_PROMPT_TOKENS = 'gen_ai.usage.prompt_tokens'

contains_trace_headers

contains_trace_headers(headers: Mapping[str, str]) -> bool
Source code in vllm/tracing.py
def contains_trace_headers(headers: Mapping[str, str]) -> bool:
    return any(h in headers for h in TRACE_HEADERS)
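
A quick check, with a hypothetical traceparent value in the W3C format (note the lookup is case-sensitive, so header keys are expected in lowercase):

from vllm.tracing import contains_trace_headers

headers = {"traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"}
print(contains_trace_headers(headers))                               # True
print(contains_trace_headers({"content-type": "application/json"}))  # False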

extract_trace_context

extract_trace_context(
    headers: Optional[Mapping[str, str]],
) -> Optional[Context]
Source code in vllm/tracing.py
def extract_trace_context(
        headers: Optional[Mapping[str, str]]) -> Optional[Context]:
    if is_otel_available():
        headers = headers or {}
        return TraceContextTextMapPropagator().extract(headers)
    else:
        return None
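
A sketch of using the extracted context to parent a new span on the caller's trace; the endpoint and traceparent value are hypothetical:

from vllm.tracing import extract_trace_context, init_tracer

tracer = init_tracer("vllm.example", "http://localhost:4317")
ctx = extract_trace_context({
    "traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01",
})
with tracer.start_as_current_span("llm_request", context=ctx):
    pass  # this span joins the caller's trace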

extract_trace_headers

extract_trace_headers(
    headers: Mapping[str, str],
) -> Mapping[str, str]
Source code in vllm/tracing.py
def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
    return {h: headers[h] for h in TRACE_HEADERS if h in headers}
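
This filters a full header mapping down to just the trace headers, for example when forwarding them to a downstream request (values hypothetical):

from vllm.tracing import extract_trace_headers

incoming = {
    "traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01",
    "authorization": "Bearer secret",
}
print(extract_trace_headers(incoming))
# {'traceparent': '00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01'}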

get_span_exporter

get_span_exporter(endpoint)
Source code in vllm/tracing.py
def get_span_exporter(endpoint):
    protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc")
    if protocol == "grpc":
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
            OTLPSpanExporter)
    elif protocol == "http/protobuf":
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
            OTLPSpanExporter)  # type: ignore
    else:
        raise ValueError(
            f"Unsupported OTLP protocol '{protocol}' is configured")

    return OTLPSpanExporter(endpoint=endpoint)
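
The exporter protocol is selected via the standard OTEL_EXPORTER_OTLP_TRACES_PROTOCOL environment variable and defaults to gRPC. A sketch assuming the conventional OTLP ports (4317 for gRPC, 4318 for HTTP):

import os
from vllm.tracing import get_span_exporter

# Select the HTTP exporter explicitly instead of the gRPC default.
os.environ["OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"] = "http/protobuf"
exporter = get_span_exporter("http://localhost:4318/v1/traces")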

init_tracer

init_tracer(
    instrumenting_module_name: str,
    otlp_traces_endpoint: str,
) -> Optional[Tracer]
Source code in vllm/tracing.py
def init_tracer(instrumenting_module_name: str,
                otlp_traces_endpoint: str) -> Optional[Tracer]:
    if not is_otel_available():
        raise ValueError(
            "OpenTelemetry is not available. Unable to initialize "
            "a tracer. Ensure OpenTelemetry packages are installed. "
            f"Original error:\n{otel_import_error_traceback}")
    trace_provider = TracerProvider()

    span_exporter = get_span_exporter(otlp_traces_endpoint)
    trace_provider.add_span_processor(BatchSpanProcessor(span_exporter))
    set_tracer_provider(trace_provider)

    tracer = trace_provider.get_tracer(instrumenting_module_name)
    return tracer
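
Note that despite the Optional[Tracer] return annotation, a missing OpenTelemetry install raises ValueError rather than returning None, so callers may want to guard with is_otel_available(). A minimal sketch, assuming a collector on the default gRPC port:

from vllm.tracing import init_tracer, is_otel_available

if is_otel_available():
    tracer = init_tracer("vllm.llm_engine", "http://localhost:4317")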

is_otel_available

is_otel_available() -> bool
Source code in vllm/tracing.py
def is_otel_available() -> bool:
    return _is_otel_imported

log_tracing_disabled_warning

log_tracing_disabled_warning() -> None
Source code in vllm/tracing.py
@run_once
def log_tracing_disabled_warning() -> None:
    logger.warning(
        "Received a request with trace context but tracing is disabled")