Skip to content

vllm.parser.nemotron_v3

Nemotron V3 parser.

The Nemotron 3 Super model uses the same tool call and reasoning format as Qwen3 (<think>/</think> reasoning tags plus <tool_call> XML). This config reuses `qwen3_config` under a distinct name.

When content is empty and the request's chat template kwargs set either enable_thinking=False or force_nonempty_content=True, reasoning and content are swapped.

Classes:

  • NemotronV3Parser

    Nemotron V3 parser: same format as Qwen3, with Nemotron-specific swapping of reasoning and content when content is empty.

NemotronV3Parser

Bases: Qwen3Parser

Nemotron V3 parser: same format as Qwen3, with Nemotron-specific behavior: when content is empty and either enable_thinking=False or force_nonempty_content=True is set, reasoning and content are swapped.

Source code in vllm/parser/nemotron_v3.py
class NemotronV3Parser(Qwen3Parser):
    """Nemotron V3 parser: same format as Qwen3, plus a content fallback.

    When the request's ``chat_template_kwargs`` set ``enable_thinking=False``
    or ``force_nonempty_content=True`` and the parsed content is empty, the
    parsed reasoning is promoted to content (the two fields are swapped).
    """

    def __init__(
        self,
        tokenizer: TokenizerLike,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> None:
        """Build the parser with a Nemotron V3 engine config.

        Args:
            tokenizer: Tokenizer forwarded unchanged to the base parser.
            tools: Optional tool definitions forwarded to the base parser.
            **kwargs: Extra keyword arguments forwarded to the base parser.
                ``chat_template_kwargs["enable_thinking"]`` (default
                ``True``) is read here and passed to
                ``nemotron_v3_config``.
        """
        # ``or {}`` covers callers that pass chat_template_kwargs=None.
        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
        thinking = chat_kwargs.get("enable_thinking", True)
        super().__init__(
            tokenizer,
            tools,
            parser_engine_config=nemotron_v3_config(thinking=thinking),
            **kwargs,
        )
        # Reasoning fragments emitted so far while streaming; joined on
        # demand by get_streaming_fallback_content().
        self._streamed_reasoning: list[str] = []

    def _reset(self, initial_state=None) -> None:
        """Reset base parser state and drop the accumulated reasoning."""
        super()._reset(initial_state=initial_state)
        self._streamed_reasoning = []

    def _events_to_delta(
        self,
        events: list[SemanticEvent],
        finished: bool = False,
    ) -> DeltaMessage | None:
        """Convert events to a delta and record any reasoning it carries.

        The delta produced by the base class is returned unmodified; its
        reasoning text, if any, is also remembered so it can later serve
        as fallback content.
        """
        delta = super()._events_to_delta(events, finished=finished)
        # Empty fragments contribute nothing to the joined fallback text,
        # so they are not stored.
        if delta is not None and delta.reasoning:
            self._streamed_reasoning.append(delta.reasoning)
        return delta

    @staticmethod
    def _should_force_content(
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> bool:
        """Return whether the request asks for content to be non-empty.

        True when the request has ``chat_template_kwargs`` with
        ``enable_thinking`` set to exactly ``False`` or
        ``force_nonempty_content`` set to exactly ``True``. Requests
        without a (non-empty) ``chat_template_kwargs`` yield ``False``.
        """
        chat_template_kwargs = getattr(request, "chat_template_kwargs", None)
        if not chat_template_kwargs:
            return False
        # Identity checks on purpose: a missing key or a merely falsy /
        # truthy value (None, 0, 1, ...) must not trigger the fallback.
        return (
            chat_template_kwargs.get("enable_thinking") is False
            or chat_template_kwargs.get("force_nonempty_content") is True
        )

    def get_streaming_fallback_content(
        self,
        text: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> str | None:
        """Return the streamed reasoning to use as fallback content.

        Args:
            text: Unused by this implementation; kept for the method's
                signature.
            request: Request whose ``chat_template_kwargs`` decide whether
                a fallback applies.

        Returns:
            The concatenation of all reasoning recorded since the last
            reset, or ``None`` when the request does not force content or
            no reasoning text was recorded.
        """
        if not self._should_force_content(request):
            return None
        return "".join(self._streamed_reasoning) or None

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Split model output into ``(reasoning, content)``.

        Delegates to the base parser, then swaps the two fields when the
        request forces content, the parsed content is ``None`` or
        whitespace-only, and there is non-empty reasoning to promote.

        Returns:
            A ``(reasoning, content)`` tuple; either element may be
            ``None``.
        """
        reasoning, content = super().extract_reasoning(model_output, request)

        content_is_blank = content is None or not content.strip()
        # Swap only when there is reasoning to promote. Without reasoning
        # the swap could not produce content and would merely relocate the
        # blank content into the reasoning field. This also matches the
        # streaming fallback, which yields nothing when no reasoning exists.
        if content_is_blank and reasoning and self._should_force_content(request):
            reasoning, content = content, reasoning

        return reasoning, content