Skip to content

vllm.renderers.online_derenderer

_convert_chat_logprobs_to_completion_logprobs(logprobs)

Convert ChatCompletionLogProbs (per-token objects) to CompletionLogProbs (parallel flat lists) as required by the /v1/completions response schema.

Source code in vllm/renderers/online_derenderer.py
def _convert_chat_logprobs_to_completion_logprobs(
    logprobs: ChatCompletionLogProbs,
) -> CompletionLogProbs:
    """Flatten chat-style logprobs into the /v1/completions layout.

    The chat format carries one object per token; the completions format
    carries four parallel lists indexed by token position (``tokens``,
    ``token_logprobs``, ``top_logprobs`` and ``text_offset``).

    Args:
        logprobs: Chat logprobs whose ``content`` is either ``None`` or a
            sequence of per-token entries.

    Returns:
        A default-constructed ``CompletionLogProbs`` when ``content`` is
        ``None``; otherwise one populated with the four parallel lists.
    """
    entries = logprobs.content
    if entries is None:
        return CompletionLogProbs()

    token_texts: list[str] = [item.token for item in entries]
    sampled_logprobs: list[float | None] = [item.logprob for item in entries]

    # An entry with no alternatives maps to None rather than an empty dict.
    alternatives: list[dict[str, float] | None] = [
        {alt.token: alt.logprob for alt in item.top_logprobs}
        if item.top_logprobs
        else None
        for item in entries
    ]

    # Each offset is the summed length of all token strings before it.
    offsets: list[int] = []
    cursor = 0
    for text in token_texts:
        offsets.append(cursor)
        cursor += len(text)

    return CompletionLogProbs(
        text_offset=offsets,
        token_logprobs=sampled_logprobs,
        tokens=token_texts,
        top_logprobs=alternatives,
    )

_correct_decoded_token(token_id, context_token_ids, tokenizer)

Use preceding tokens as context to fix U+FFFD from byte-fallback.

Mirrors LogprobsProcessor._correct_decoded_token in v1/engine/logprobs.py.

Source code in vllm/renderers/online_derenderer.py
def _correct_decoded_token(
    token_id: int, context_token_ids: list[int], tokenizer: TokenizerLike
) -> str:
    """Use preceding tokens as context to fix U+FFFD from byte-fallback.

    Mirrors LogprobsProcessor._correct_decoded_token in v1/engine/logprobs.py.
    """
    max_ctx = min(len(context_token_ids), 4)

    for num_ctx in range(1, max_ctx + 1):
        context = context_token_ids[-num_ctx:]
        full_decoded = tokenizer.decode(context + [token_id])

        if full_decoded.endswith("�"):
            continue

        clean_end = len(context)
        for j in range(len(context) - 1, -1, -1):
            if tokenizer.decode([context[j]]).endswith("�"):
                clean_end = j
            else:
                break

        clean_prefix = tokenizer.decode(context[:clean_end]) if clean_end > 0 else ""

        if full_decoded.startswith(clean_prefix):
            return full_decoded[len(clean_prefix) :]

        common_len = 0
        for a, b in zip(clean_prefix, full_decoded):
            if a != b:
                break
            common_len += 1
        return full_decoded[common_len:]

    return ""

_parse_token_id_placeholder(token)

Extract token ID from a 'token_id:N' placeholder string.

Source code in vllm/renderers/online_derenderer.py
def _parse_token_id_placeholder(token: str) -> int | None:
    """Extract token ID from a 'token_id:N' placeholder string."""
    if not token.startswith("token_id:"):
        return None
    try:
        return int(token[len("token_id:") :])
    except ValueError:
        return None

_resolve_logprobs(logprobs, tokenizer)

Resolve token_id:N placeholders in a ChatCompletionLogProbs object.

Source code in vllm/renderers/online_derenderer.py
def _resolve_logprobs(
    logprobs: ChatCompletionLogProbs, tokenizer: TokenizerLike
) -> ChatCompletionLogProbs:
    """Resolve token_id:N placeholders in a ChatCompletionLogProbs object.

    Every entry, and every alternative in its ``top_logprobs``, has its
    ``token`` and ``bytes`` fields replaced by the values obtained from
    ``resolve_token_id_placeholder``. When a resolved string ends in U+FFFD
    and the original token was a parseable placeholder, the string is
    re-derived with ``_correct_decoded_token`` using the sampled token ids
    seen so far as context, and ``bytes`` becomes its UTF-8 encoding.

    Args:
        logprobs: Chat logprobs to resolve; returned unchanged when its
            ``content`` is ``None``.
        tokenizer: Tokenizer passed through to the resolving helpers.

    Returns:
        A new ``ChatCompletionLogProbs`` built from copies of the entries.
    """
    entries = logprobs.content
    if entries is None:
        return logprobs

    # Ids of the sampled tokens from earlier entries, in order.
    history: list[int] = []

    def _materialize(raw_token):
        """Resolve one token string; return (text, bytes, parsed id or None)."""
        text, raw_bytes = resolve_token_id_placeholder(raw_token, tokenizer)
        parsed_id = _parse_token_id_placeholder(raw_token)
        if parsed_id is not None and text.endswith("�"):
            text = _correct_decoded_token(parsed_id, history, tokenizer)
            raw_bytes = list(text.encode("utf-8"))
        return text, raw_bytes, parsed_id

    resolved_entries = []
    for entry in entries:
        entry_text, entry_bytes, sampled_id = _materialize(entry.token)

        # Alternatives are corrected against the same context as the sampled
        # token: the history is only extended after this entry is finished.
        resolved_alternatives = []
        for alternative in entry.top_logprobs:
            alt_text, alt_bytes, _ = _materialize(alternative.token)
            resolved_alternatives.append(
                alternative.model_copy(
                    update={"token": alt_text, "bytes": alt_bytes}
                )
            )

        resolved_entries.append(
            entry.model_copy(
                update={
                    "token": entry_text,
                    "bytes": entry_bytes,
                    "top_logprobs": resolved_alternatives,
                }
            )
        )

        if sampled_id is not None:
            history.append(sampled_id)

    return ChatCompletionLogProbs(content=resolved_entries)