`vllm.parser.harmony` ¶

Classes:

HarmonyParser –

`HarmonyParser` ¶

Bases: DelegatingParser

Methods:

parse –

Parse Harmony output from token IDs.

Source code in vllm/parser/harmony.py

class HarmonyParser(DelegatingParser):
    def __init__(self, tokenizer, tools=None, *args, **kwargs):
        super().__init__(tokenizer, tools, *args, **kwargs)

        if self.reasoning_parser and not isinstance(
            self.reasoning_parser, GptOssReasoningParser
        ):
            raise ValueError(
                "Harmony requires GptOssReasoningParser, "
                f"got {self.reasoning_parser.__class__.__name__}."
            )

        if self.tool_parser and not isinstance(self.tool_parser, GptOssToolParser):
            raise ValueError(
                "Harmony requires GptOssToolParser, "
                f"got {self.tool_parser.__class__.__name__}."
            )

        self._parser: StreamableParser | None = None
        self._next_tool_call_index = 0
        self._num_processed_messages = 0

        # For error recovery
        self._current_message_tokens: list[int] = []

    @property
    def _harmony_parser(self) -> StreamableParser:
        """Lazily initializes the Harmony parser."""
        if self._parser is None:
            self._parser = get_streamable_parser_for_assistant()
        return self._parser

    def _poll_completed_message(self) -> Message | None:
        messages = self._harmony_parser.messages
        if len(messages) <= self._num_processed_messages:
            return None
        msg = messages[self._num_processed_messages]
        msg.recipient = self._normalize_recipient(msg.recipient)
        self._num_processed_messages += 1
        return msg

    def flush(self) -> list[Segment]:
        segments: list[Segment] = []
        try:
            self._harmony_parser.process_eos()
            msg = self._poll_completed_message()
        except HarmonyError:
            logger.warning(
                "Harmony parser ended in a non-terminal state; returning the "
                "recovered raw output."
            )

            final_channel = "final"
            text = self.model_tokenizer.decode(self._current_message_tokens)
            segments.append(
                Segment(
                    channel=final_channel,
                    recipient=None,
                    delta=text,
                    completed_message=None,
                )
            )
            msg = Message.from_role_and_content(Role.ASSISTANT, text).with_channel(
                final_channel
            )

        # Reset to the initial assistant-parser state for the next turn.
        self._parser = None
        self._num_processed_messages = 0
        self._current_message_tokens.clear()

        if msg is None:
            return segments

        segments.append(
            Segment(
                channel=msg.channel,
                recipient=msg.recipient,
                delta="",
                completed_message=msg,
            )
        )
        return segments

    def parse(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
        enable_auto_tools: bool = False,
        model_output_token_ids: Sequence[int] = (),
    ) -> tuple[str | None, str | None, list[FunctionCall] | None]:
        """Parse Harmony output from token IDs.

        Tool calls are always extracted regardless of ``enable_auto_tools``.
        Callers must decide whether to surface them.
        """
        result = self.process_chunk(model_output_token_ids)
        flushed_segments = self.flush()
        if flushed_segments:
            result.segments.extend(flushed_segments)

        reasoning_parts: list[str] = []
        content_parts: list[str] = []
        tool_calls: list[FunctionCall] = []

        for segment in result.segments:
            msg = segment.completed_message
            if msg is None:
                continue
            if msg.author.role != "assistant" or not msg.content:
                continue
            text = msg.content[0].text
            segment_type = _SegmentType.from_channel_and_recipient(
                msg.channel, msg.recipient
            )
            match segment_type:
                case _SegmentType.REASONING if self.reasoning_parser and text:
                    reasoning_parts.append(text)
                case _SegmentType.CONTENT if text:
                    content_parts.append(text)
                case _SegmentType.TOOL if self.tool_parser:
                    recipient = msg.recipient
                    content_type = msg.content_type
                    assert recipient is not None
                    if content_type is not None and "json" not in content_type:
                        arguments = text
                    else:
                        try:
                            arguments = json.dumps(json.loads(text))
                        except json.JSONDecodeError:
                            arguments = text
                    tool_calls.append(
                        FunctionCall(
                            name=extract_function_from_recipient(recipient),
                            arguments=arguments,
                        )
                    )

        reasoning = "\n".join(reasoning_parts) or None
        content = "\n".join(content_parts) or None
        return reasoning, content, tool_calls or None

    def parse_delta(
        self,
        delta_text: str,
        delta_token_ids: list[int],
        request: ChatCompletionRequest | ResponsesRequest,
        prompt_token_ids: list[int] | None = None,
        *,
        finished: bool,
    ) -> DeltaMessage | None:
        prev_recipient = self._normalize_recipient(
            self._harmony_parser.current_recipient
        )
        result = self.process_chunk(delta_token_ids)
        if finished:
            flushed_segments = self.flush()
            if flushed_segments:
                result.segments.extend(flushed_segments)
        combined_content = ""
        combined_reasoning = ""
        tool_messages: list[DeltaToolCall] = []

        for segment in result.segments:
            if segment.completed_message is not None:
                prev_recipient = None
                continue

            segment_type = _SegmentType.from_channel_and_recipient(
                segment.channel, segment.recipient
            )
            match segment_type:
                case _SegmentType.REASONING if self.reasoning_parser:
                    combined_reasoning += segment.delta
                case _SegmentType.CONTENT:
                    combined_content += segment.delta
                case _SegmentType.TOOL if self.tool_parser:
                    assert segment.recipient is not None
                    if prev_recipient != segment.recipient:
                        tool_name = extract_function_from_recipient(segment.recipient)
                        tool_messages.append(
                            DeltaToolCall(
                                # HarmonyParser does not use _stream_state;
                                # "random" tool_call_id_type is always used
                                id=make_tool_call_id(),
                                type="function",
                                function=DeltaFunctionCall(
                                    name=tool_name,
                                    arguments=segment.delta,
                                ),
                                index=self._next_tool_call_index,
                            )
                        )
                        self._next_tool_call_index += 1
                        prev_recipient = segment.recipient
                    elif segment.delta:
                        idx = self._next_tool_call_index - 1
                        if tool_messages:
                            tool_msg = tool_messages[-1]
                            assert tool_msg.index == idx
                            fn = tool_msg.function
                            assert fn is not None and fn.arguments is not None
                            fn.arguments += segment.delta
                        else:
                            tool_messages.append(
                                DeltaToolCall(
                                    index=idx,
                                    function=DeltaFunctionCall(arguments=segment.delta),
                                )
                            )

        if finished:
            self._next_tool_call_index = 0

        if not combined_content and not combined_reasoning and not tool_messages:
            return None

        delta_message = DeltaMessage()
        if combined_content:
            delta_message.content = combined_content
        if combined_reasoning:
            delta_message.reasoning = combined_reasoning
        if tool_messages:
            delta_message.tool_calls = tool_messages

        # Suppress reasoning deltas if not requested
        if delta_message and not request.include_reasoning:
            delta_message.reasoning = None

            # If only reasoning was in the message (no content, no tool_calls)
            # skip emitting entirely
            if not delta_message.content and not delta_message.tool_calls:
                return None

        return delta_message

    def process_chunk(self, token_ids: Sequence[int]) -> ChunkResult:
        if not token_ids:
            return ChunkResult(segments=[], reasoning_token_count=0)

        segments: list[Segment] = []
        reasoning_token_count = 0
        for token_id in token_ids:
            self._harmony_parser.process(token_id)
            channel = self._harmony_parser.current_channel
            recipient = self._normalize_recipient(
                self._harmony_parser.current_recipient
            )
            delta = self._harmony_parser.last_content_delta or ""
            completed_message = self._poll_completed_message()

            if completed_message is not None:
                self._current_message_tokens.clear()
            else:
                self._current_message_tokens.append(token_id)

            if channel == "analysis" or (
                channel == "commentary" and recipient is not None
            ):
                reasoning_token_count += 1

            segments.append(
                Segment(
                    channel=channel,
                    recipient=recipient,
                    delta=delta,
                    completed_message=completed_message,
                )
            )

            # TODO: Optionally merge and suppress empty Segments

        return ChunkResult(
            segments=segments,
            reasoning_token_count=reasoning_token_count,
        )

    @staticmethod
    def _normalize_recipient(recipient: str | None) -> str | None:
        """Remove constrained formats misparsed into recipients by older Harmony."""
        if recipient is None:
            return None

        constrain_index = recipient.find("<|constrain|>")
        if constrain_index == -1:
            return recipient
        return recipient[:constrain_index].rstrip() or None

`_harmony_parser` `property` ¶

Lazily initializes the Harmony parser.

`_normalize_recipient(recipient)` `staticmethod` ¶

Remove constrained formats misparsed into recipients by older Harmony.

Source code in vllm/parser/harmony.py

@staticmethod
def _normalize_recipient(recipient: str | None) -> str | None:
    """Remove constrained formats misparsed into recipients by older Harmony."""
    if recipient is None:
        return None

    constrain_index = recipient.find("<|constrain|>")
    if constrain_index == -1:
        return recipient
    return recipient[:constrain_index].rstrip() or None

`parse(model_output, request, enable_auto_tools=False, model_output_token_ids=())` ¶

Parse Harmony output from token IDs.

Tool calls are always extracted regardless of enable_auto_tools. Callers must decide whether to surface them.

Source code in vllm/parser/harmony.py

def parse(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
    enable_auto_tools: bool = False,
    model_output_token_ids: Sequence[int] = (),
) -> tuple[str | None, str | None, list[FunctionCall] | None]:
    """Parse Harmony output from token IDs.

    Tool calls are always extracted regardless of ``enable_auto_tools``.
    Callers must decide whether to surface them.
    """
    result = self.process_chunk(model_output_token_ids)
    flushed_segments = self.flush()
    if flushed_segments:
        result.segments.extend(flushed_segments)

    reasoning_parts: list[str] = []
    content_parts: list[str] = []
    tool_calls: list[FunctionCall] = []

    for segment in result.segments:
        msg = segment.completed_message
        if msg is None:
            continue
        if msg.author.role != "assistant" or not msg.content:
            continue
        text = msg.content[0].text
        segment_type = _SegmentType.from_channel_and_recipient(
            msg.channel, msg.recipient
        )
        match segment_type:
            case _SegmentType.REASONING if self.reasoning_parser and text:
                reasoning_parts.append(text)
            case _SegmentType.CONTENT if text:
                content_parts.append(text)
            case _SegmentType.TOOL if self.tool_parser:
                recipient = msg.recipient
                content_type = msg.content_type
                assert recipient is not None
                if content_type is not None and "json" not in content_type:
                    arguments = text
                else:
                    try:
                        arguments = json.dumps(json.loads(text))
                    except json.JSONDecodeError:
                        arguments = text
                tool_calls.append(
                    FunctionCall(
                        name=extract_function_from_recipient(recipient),
                        arguments=arguments,
                    )
                )

    reasoning = "\n".join(reasoning_parts) or None
    content = "\n".join(content_parts) or None
    return reasoning, content, tool_calls or None

vllm.parser.harmony ¶

HarmonyParser ¶

_harmony_parser property ¶

_normalize_recipient(recipient) staticmethod ¶

parse(model_output, request, enable_auto_tools=False, model_output_token_ids=()) ¶

`vllm.parser.harmony` ¶

`HarmonyParser` ¶

`_harmony_parser` `property` ¶

`_normalize_recipient(recipient)` `staticmethod` ¶

`parse(model_output, request, enable_auto_tools=False, model_output_token_ids=())` ¶