Skip to content

vllm.parser.abstract_parser

Classes:

  • DelegatingParser

    A Parser implementation that delegates to separate ReasoningParser and ToolParser instances.

  • Parser

    Abstract Parser class that unifies ReasoningParser and ToolParser into a single interface for parsing model output.

  • StreamState

    Mutable state for Parser.parse_delta(). One per stream.

DelegatingParser

Bases: Parser

A Parser implementation that delegates to separate ReasoningParser and ToolParser instances.

This is the recommended base class for creating model-specific parsers that combine existing reasoning and tool parser implementations. Subclasses should set self._reasoning_parser and self._tool_parser in their __init__ method.

If either parser is None, the corresponding methods will return default values (no reasoning extraction, no tool calls).

Methods:

Source code in vllm/parser/abstract_parser.py
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
class DelegatingParser(Parser):
    """
    A Parser implementation that delegates to separate ReasoningParser and
    ToolParser instances.

    This is the recommended base class for creating model-specific parsers
    that combine existing reasoning and tool parser implementations.
    Subclasses should set `self._reasoning_parser` and `self._tool_parser`
    in their `__init__` method.

    If either parser is None, the corresponding methods will return default
    values (no reasoning extraction, no tool calls).
    """

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Split *model_output* into a ``(reasoning, content)`` pair.

        The work is delegated to the configured reasoning parser. When no
        reasoning parser is set, nothing is extracted: the reasoning slot is
        ``None`` and the full output is returned as content.
        """
        reasoning_parser = self._reasoning_parser
        if reasoning_parser is not None:
            return reasoning_parser.extract_reasoning(model_output, request)
        return None, model_output

    def _get_function_name(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> str:
        """Return the function name selected by a named ``tool_choice``.

        Raises:
            ValueError: If ``request.tool_choice`` is falsy or is neither a
                ``ToolChoiceFunction`` nor a
                ``ChatCompletionNamedToolChoiceParam``.
        """
        choice = request.tool_choice
        if choice:
            # The two named-choice flavours keep the name in different places.
            if isinstance(choice, ToolChoiceFunction):
                return choice.name
            if isinstance(choice, ChatCompletionNamedToolChoiceParam):
                return choice.function.name
        raise ValueError("Invalid tool_choice for function name extraction.")

    def _extract_tool_calls(
        self,
        content: str | None,
        request: ChatCompletionRequest | ResponsesRequest,
        enable_auto_tools: bool = False,
    ) -> tuple[list[FunctionCall] | None, str | None]:
        """Extract tool calls from already reasoning-stripped *content*.

        The strategy depends on ``request.tool_choice`` and on whether the
        tool parser reports ``supports_required_and_named``:

        * named tool choice (supported): the whole content becomes the
          ``arguments`` of a single call to the named function;
        * ``"required"`` (supported): the content is validated as a JSON
          list of ``FunctionDefinition`` objects;
        * auto (``"auto"``/``None``, or required/named when unsupported),
          only if *enable_auto_tools* is set: delegated to
          ``self.extract_tool_calls``.

        Args:
            content: Model output left after reasoning extraction, or None.
            request: The request whose ``tool_choice`` drives the strategy.
            enable_auto_tools: Whether automatic tool-call parsing is allowed.

        Returns:
            ``(tool_calls, remaining_content)``. ``tool_calls`` is ``None``
            only when the auto branch ran and found no tool calls; it is an
            empty list when there is no tool parser or no branch applied.
        """
        tool_parser = self._tool_parser
        if tool_parser is None:
            return [], content

        supports_required_and_named = tool_parser.supports_required_and_named
        is_named_tool_choice = request.tool_choice and isinstance(
            request.tool_choice,
            (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
        )
        is_required_tool_choice = request.tool_choice == "required"
        # Auto parsing doubles as the fallback for named/required choices
        # when the tool parser cannot handle them natively.
        is_auto_tool_choice = enable_auto_tools and (
            request.tool_choice == "auto"
            or request.tool_choice is None
            or (
                not supports_required_and_named
                and (is_named_tool_choice or is_required_tool_choice)
            )
        )

        tool_calls = list[FunctionCall]()
        if is_named_tool_choice and supports_required_and_named:
            if content is None:
                return [], None
            # The entire content is treated as the arguments of the one
            # function the request named.
            tool_calls.append(
                FunctionCall(
                    name=self._get_function_name(request),
                    arguments=content,
                )
            )
            content = None
        elif is_required_tool_choice and supports_required_and_named:
            # "required" with standard JSON-based parsing
            parsed_calls = []
            # A validation failure is swallowed on purpose: parsed_calls
            # stays empty and no tool calls are produced.
            with contextlib.suppress(ValidationError):
                content = content or ""
                parsed_calls = TypeAdapter(list[FunctionDefinition]).validate_json(
                    content
                )
            for tc in parsed_calls:
                tool_calls.append(
                    FunctionCall(
                        name=tc.name,
                        arguments=json.dumps(tc.parameters, ensure_ascii=False),
                    )
                )
            content = None
        elif is_auto_tool_choice:
            # Automatic Tool Call Parsing (also used as fallback for
            # required/named when supports_required_and_named=False)
            tool_call_info = self.extract_tool_calls(
                content if content is not None else "",
                request=request,
            )
            if tool_call_info is not None and tool_call_info.tools_called:
                tool_calls.extend(
                    FunctionCall(
                        id=tc.id,
                        name=tc.function.name,
                        arguments=tc.function.arguments,
                    )
                    for tc in tool_call_info.tool_calls
                )
                content = tool_call_info.content
                # Whitespace-only leftover content is dropped; an empty
                # string is left as-is because it is falsy here.
                if content and content.strip() == "":
                    content = None
            else:
                # No tool calls.
                return None, content

        return tool_calls, content

    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        """Let the configured parsers adjust *request* before generation.

        The reasoning parser (if any) runs first. When a tool parser is
        configured, the structural tag is applied next and the tool parser's
        own ``adjust_request`` runs last. The adjusted request is returned.
        """
        reasoning_parser = self._reasoning_parser
        if reasoning_parser is not None:
            request = reasoning_parser.adjust_request(request)

        if self._tool_parser is None:
            return request

        request = self._apply_structural_tag(request)
        return self._tool_parser.adjust_request(request)

    def _apply_structural_tag(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        """Attach the tool parser's structural tag to *request* when relevant.

        The request is returned unchanged when there is no tool parser, the
        tool parser has no ``structural_tag_model``, the request has no
        tools, ``tool_choice`` does not ask for tool calling, or the tool
        parser yields no structural tag. Otherwise
        ``request.structured_outputs`` is replaced and the request's
        ``text`` (``ResponsesRequest``) or ``response_format`` (other
        requests) is cleared.
        """
        tool_parser = self._tool_parser
        if tool_parser is None:
            return request
        if tool_parser.structural_tag_model is None:
            return request
        if not request.tools:
            return request

        wants_tool_calls = (
            request.tool_choice == "auto"
            or request.tool_choice == "required"
            or isinstance(
                request.tool_choice,
                (ChatCompletionNamedToolChoiceParam, ToolChoiceFunction),
            )
        )
        if not wants_tool_calls:
            return request

        tag_model = self._tool_parser.get_structural_tag(request, reasoning=False)
        if tag_model is None:
            return request

        serialized_tag = json.dumps(tag_model.model_dump())
        request.structured_outputs = StructuredOutputsParams(
            structural_tag=serialized_tag,
        )
        # Clear the request's own output-format field now that the
        # structural tag has been installed.
        if isinstance(request, ResponsesRequest):
            request.text = None
        else:
            request.response_format = None
        return request

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract reasoning from one streaming delta.

        Delegates to the reasoning parser when one is configured; otherwise
        the delta text is passed through unchanged as content.
        """
        reasoning_parser = self._reasoning_parser
        if reasoning_parser is not None:
            return reasoning_parser.extract_reasoning_streaming(
                previous_text,
                current_text,
                delta_text,
                previous_token_ids,
                current_token_ids,
                delta_token_ids,
            )
        return DeltaMessage(content=delta_text)

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> ExtractedToolCallInformation:
        """Extract tool calls from a complete (non-streaming) model output.

        Without a tool parser, the output is returned untouched as content
        with ``tools_called=False``. Otherwise the call is delegated to the
        tool parser, and the outcome is always reported through
        ``record_tool_parser_invocation``: ``is_tool_called`` is a bool on
        success or the raised exception on failure. Exceptions from the tool
        parser are re-raised after being recorded.
        """
        if self._tool_parser is None:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
        result = None
        is_tool_called: bool | Exception = False
        try:
            result = self._tool_parser.extract_tool_calls(
                model_output,
                request=request,  # type: ignore[arg-type]
            )
            is_tool_called = bool(result.tools_called)
        except Exception as e:
            # Hand the exception itself to the recorder, then propagate it.
            is_tool_called = e
            raise
        finally:
            # Runs on both the success and the failure path.
            record_tool_parser_invocation(
                is_tool_called=is_tool_called,
                is_streaming=False,
                request=request,
            )
        return result

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> DeltaMessage | None:
        """Extract tool calls from one streaming delta.

        Returns ``None`` when no tool parser is configured. Otherwise the
        call is delegated to the tool parser and the outcome is always
        reported through ``record_tool_parser_invocation`` with
        ``is_streaming=True``: ``is_tool_called`` is True when the returned
        delta carries tool calls, or is the raised exception on failure.
        Exceptions from the tool parser are re-raised after being recorded.
        """
        if self._tool_parser is None:
            return None
        result = None
        is_tool_called: bool | Exception = False
        try:
            result = self._tool_parser.extract_tool_calls_streaming(
                previous_text,
                current_text,
                delta_text,
                previous_token_ids,
                current_token_ids,
                delta_token_ids,
                request,  # type: ignore[arg-type]
            )
            # A None result or one without tool calls counts as "not called".
            is_tool_called = bool(result and result.tool_calls)
        except Exception as e:
            # Hand the exception itself to the recorder, then propagate it.
            is_tool_called = e
            raise
        finally:
            # Runs on both the success and the failure path.
            record_tool_parser_invocation(
                is_tool_called=is_tool_called,
                is_streaming=True,
                request=request,
            )
        return result

    def _extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest | ResponsesRequest,
        # The following parameters are used for "required" tool choice parsing and are
        # tracked in StreamState for streaming parsing.
        tool_call_idx: int | None = None,
        tool_call_id_type: str = "random",
        function_name_returned: bool = False,
    ) -> tuple[DeltaMessage | None, bool]:
        """Dispatch streaming tool-call extraction based on ``tool_choice``.

        Requires a configured tool parser (asserted). Branches, in order:

        * ``"none"``: no tool calls are emitted; the delta is either run
          through the engine-backed pipeline with tool calls stripped, or
          passed through as plain content.
        * named tool choice (when the tool parser supports it): handled by
          ``extract_named_tool_call_streaming``.
        * ``"required"`` (when the tool parser supports it): handled by
          ``extract_required_tool_call_streaming``.
        * anything else: ``self.extract_tool_calls_streaming``.

        Returns:
            ``(delta_message, function_name_returned)``. The flag is always
            False for the ``"none"`` and fallback branches.
        """
        assert self._tool_parser is not None
        supports_required_and_named = self._tool_parser.supports_required_and_named

        if request.tool_choice == "none":
            if self._engine_based:
                # Engine-backed parsers route content extraction through
                # extract_tool_calls_streaming, so run the full pipeline
                # and strip tool_calls after.
                delta_message = self.extract_tool_calls_streaming(
                    previous_text,
                    current_text,
                    delta_text,
                    previous_token_ids,
                    current_token_ids,
                    delta_token_ids,
                    request,  # type: ignore[arg-type]
                )
                if delta_message:
                    delta_message.tool_calls = []
                return delta_message, False
            # Non-engine parsers: forward the raw delta, or nothing when the
            # delta text is empty.
            return (DeltaMessage(content=delta_text) if delta_text else None), False

        if (
            supports_required_and_named
            and request.tool_choice
            and isinstance(
                request.tool_choice,
                (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
            )
        ):
            delta_message, function_name_returned = extract_named_tool_call_streaming(
                delta_text=delta_text,
                function_name=self._get_function_name(request),
                function_name_returned=function_name_returned,
                tool_call_idx=tool_call_idx,
                tool_call_id_type=tool_call_id_type,
                tokenizer=self.model_tokenizer,
            )
            return delta_message, function_name_returned

        if supports_required_and_named and request.tool_choice == "required":
            delta_message, function_name_returned = (
                extract_required_tool_call_streaming(
                    previous_text=previous_text,
                    current_text=current_text,
                    delta_text=delta_text,
                    function_name_returned=function_name_returned,
                    tool_call_idx=tool_call_idx,
                    tool_call_id_type=tool_call_id_type,
                )
            )
            return delta_message, function_name_returned
        # Fallback: let the tool parser's own streaming extraction decide.
        return self.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
            request,
        ), False

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Ask the reasoning parser whether reasoning has ended in *input_ids*.

        Always False when no reasoning parser is configured.
        """
        reasoning_parser = self._reasoning_parser
        if reasoning_parser is not None:
            return reasoning_parser.is_reasoning_end(input_ids)
        return False

    def is_reasoning_end_streaming(
        self, input_ids: list[int], delta_ids: list[int]
    ) -> bool:
        """Streaming variant of ``is_reasoning_end``.

        Forwards both id lists to the reasoning parser; always False when
        no reasoning parser is configured.
        """
        reasoning_parser = self._reasoning_parser
        if reasoning_parser is not None:
            return reasoning_parser.is_reasoning_end_streaming(input_ids, delta_ids)
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return the content token ids as determined by the reasoning parser.

        Without a reasoning parser, *input_ids* is returned as-is.
        """
        reasoning_parser = self._reasoning_parser
        if reasoning_parser is not None:
            return reasoning_parser.extract_content_ids(input_ids)
        return input_ids

    def _in_reasoning_phase(self, state: StreamState) -> bool:
        """Tell whether the stream is still producing reasoning.

        True only when a reasoning parser exists and *state* has not yet
        recorded the end of reasoning.
        """
        return self._reasoning_parser is not None and not state.reasoning_ended

    def _in_tool_call_phase(self, state: StreamState) -> bool:
        """Tell whether deltas should go through tool-call extraction.

        Requires a tool parser; the result then mirrors
        ``state.reasoning_ended``.
        """
        return self._tool_parser is not None and state.reasoning_ended

    def _append_unstreamed_tool_args(
        self,
        delta_message: DeltaMessage | None,
    ) -> None:
        """Append parsed-but-unstreamed tool-call arguments to *delta_message*.

        Only the last tool call of the delta is touched, and only when a
        tool parser is configured and that call has a ``function``. The
        message is modified in place.
        """
        if self._tool_parser is None or not delta_message:
            return
        pending_calls = delta_message.tool_calls
        if not pending_calls:
            return
        trailing_call = pending_calls[-1]
        if not trailing_call.function:
            return

        already_streamed = trailing_call.function.arguments or ""
        trailing_call.function.arguments = (
            already_streamed + self._tool_parser.get_remaining_unstreamed_args()
        )

    def finalize_generation(
        self,
        delta_message: DeltaMessage | None,
        request: ChatCompletionRequest | ResponsesRequest,
        state: StreamState,
    ) -> DeltaMessage | None:
        """Finalize generation for cases where generation was incomplete.

        For example, streaming may have terminated before reasoning ended.
        In that case, a reasoning parser that offers
        ``get_streaming_fallback_content`` is asked for text to promote, and
        any non-empty result is appended to the delta's content (creating
        the delta if needed). Unstreamed tool-call arguments are appended
        last.
        """
        promote = getattr(
            self._reasoning_parser, "get_streaming_fallback_content", None
        )
        if promote is not None and not state.reasoning_ended:
            promoted_text = promote(state.previous_text, request)
            if promoted_text:
                if delta_message is None:
                    delta_message = DeltaMessage()
                existing_content = delta_message.content or ""
                delta_message.content = existing_content + promoted_text

        self._append_unstreamed_tool_args(delta_message)
        return delta_message

    def parse(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
        enable_auto_tools: bool = False,
        model_output_token_ids: Sequence[int] = (),
    ) -> tuple[str | None, str | None, list[FunctionCall] | None]:
        """Parse a complete model output in one shot.

        Reasoning is extracted first; tool calls are then extracted from
        whatever content remains.

        Returns:
            ``(reasoning, content, tool_calls)``.
        """
        reasoning, remaining = self.extract_reasoning(model_output, request)
        extracted = self._extract_tool_calls(
            content=remaining,
            request=request,
            enable_auto_tools=enable_auto_tools,
        )
        tool_calls, content = extracted
        return reasoning, content, tool_calls

    def parse_delta(
        self,
        delta_text: str,
        delta_token_ids: list[int],
        request: ChatCompletionRequest | ResponsesRequest,
        prompt_token_ids: list[int] | None = None,
        *,
        finished: bool,
    ) -> DeltaMessage | None:
        """Parse one streaming delta using the parser's ``StreamState``.

        Each call runs up to three stages against ``self._stream_state``:
        reasoning extraction while reasoning is open, tool-call extraction
        once reasoning has ended, and a plain-content pass-through when
        neither phase is active. A single delta can go through both the
        reasoning and tool-call stages when reasoning ends inside it.

        Args:
            delta_text: Newly generated text for this step.
            delta_token_ids: Token ids matching *delta_text*.
            request: The originating request.
            prompt_token_ids: Prompt token ids; the first call that supplies
                them is used to decide whether reasoning is already closed.
            finished: When True, finalization and engine flushing run after
                the delta has been processed.

        Returns:
            The delta message to emit, or None when there is nothing to emit.
        """
        state = self._stream_state

        # One-time check, performed on the first call that provides prompt ids.
        if not state.prompt_reasoning_checked and prompt_token_ids is not None:
            state.prompt_reasoning_checked = True
            if self._reasoning_parser is None or self.is_reasoning_end(
                prompt_token_ids
            ):
                state.reasoning_ended = True
            else:
                # Reasoning is still open at the end of the prompt; let the
                # reasoning parser adjust its initial parsing state so the
                # first generated tokens are classified correctly.
                self._reasoning_parser.adjust_initial_state_from_prompt(
                    prompt_token_ids
                )

        current_text, current_token_ids = state.advance(delta_text, delta_token_ids)
        delta_message: DeltaMessage | None = None
        reasoning_transitioned = False

        # Reasoning extraction
        if self._in_reasoning_phase(state):
            delta_message = self.extract_reasoning_streaming(
                previous_text=state.previous_text,
                current_text=current_text,
                delta_text=delta_text,
                previous_token_ids=state.previous_token_ids,
                current_token_ids=current_token_ids,
                delta_token_ids=delta_token_ids,
            )
            reasoning_parser = self._reasoning_parser
            # Engine-based reasoning parsers report the end of reasoning
            # themselves; others are asked via the token ids.
            if reasoning_parser is not None and reasoning_parser.engine_based_streaming:
                should_transition = (
                    reasoning_parser.has_engine_confirmed_reasoning_end()
                )
            else:
                should_transition = self.is_reasoning_end_streaming(
                    current_token_ids, delta_token_ids
                )
            if should_transition:
                state.reasoning_ended = True
                reasoning_transitioned = True
                # From here on, current_text / current_token_ids describe
                # only the post-reasoning part of this delta.
                current_token_ids = self.extract_content_ids(delta_token_ids)
                if self._engine_based:
                    flush_delta = reasoning_parser.finish_streaming()  # type: ignore[union-attr, attr-defined]
                    current_text = (
                        (delta_message.content if delta_message else None) or ""
                    ) + ((flush_delta.content if flush_delta else None) or "")
                    # With a tool parser present, this content is left to the
                    # tool-call stage below instead of being emitted here.
                    if delta_message and self._tool_parser is not None:
                        delta_message.content = None
                else:
                    current_text = (
                        delta_message.content
                        if delta_message and delta_message.content
                        else ""
                    )
                    delta_text = current_text

        # Tool call extraction
        if self._in_tool_call_phase(state):
            if not state.tool_call_text_started:
                # First delta of the tool-call phase: start from an empty
                # history so the tool parser sees only post-reasoning text.
                state.tool_call_text_started = True
                state.previous_text = ""
                state.previous_token_ids = []
                delta_text = current_text
                delta_token_ids = current_token_ids

            # delta_message is about to be overwritten; remember any
            # reasoning produced by this same delta.
            reasoning_from_this_batch = (
                delta_message.reasoning if delta_message else None
            )

            delta_message, state.function_name_returned = (
                self._extract_tool_calls_streaming(
                    previous_text=state.previous_text,
                    current_text=current_text,
                    delta_text=delta_text,
                    previous_token_ids=state.previous_token_ids,
                    current_token_ids=current_token_ids,
                    delta_token_ids=delta_token_ids,
                    request=request,  # type: ignore[arg-type]
                    tool_call_idx=state.history_tool_call_cnt,
                    tool_call_id_type=state.tool_call_id_type,
                    function_name_returned=state.function_name_returned,
                )
            )

            # Re-attach the saved reasoning unless the new delta already
            # carries its own.
            if reasoning_from_this_batch:
                if delta_message is None:
                    delta_message = DeltaMessage(reasoning=reasoning_from_this_batch)
                elif not delta_message.reasoning:
                    delta_message.reasoning = reasoning_from_this_batch

            # Only the first tool call of the delta is inspected; an id on
            # it bumps the running tool-call counter.
            if (
                delta_message
                and delta_message.tool_calls
                and delta_message.tool_calls[0].id is not None
            ):
                state.history_tool_call_cnt += 1

        # No phase active: pass through as content.
        # Skip when reasoning just ended in this delta — the engine already
        # consumed the end-of-reasoning marker (e.g. </think>) and
        # delta_text still contains the raw marker text.
        if (
            delta_message is None
            and not reasoning_transitioned
            and not self._in_reasoning_phase(state)
            and not self._in_tool_call_phase(state)
        ):
            delta_message = DeltaMessage(content=delta_text)

        state.commit(current_text, current_token_ids)

        if finished:
            delta_message = self.finalize_generation(delta_message, request, state)
            delta_message = self._flush_engine_parsers(delta_message)

        return delta_message

    def _flush_engine_parsers(
        self, delta_message: DeltaMessage | None
    ) -> DeltaMessage | None:
        """Flush buffered state from engine-based parsers at stream end.

        Each engine-based parser that exposes ``finish_streaming`` is
        flushed, reasoning parser first, and the flushed content, reasoning
        and tool calls are appended to *delta_message* (or become the
        message when there was none).
        """
        reasoning_ended = self._stream_state.reasoning_ended
        for parser in (self._reasoning_parser, self._tool_parser):
            if not getattr(parser, "engine_based_streaming", False):
                continue
            # When reasoning has ended and we transitioned to the tool
            # phase, the reasoning parser's engine may still have buffered
            # characters from tool-call markup it saw with
            # skip_tool_parsing=True.  Flushing that would leak spurious
            # content (e.g. a stray '"'), so skip it.
            if reasoning_ended and parser is self._reasoning_parser:
                continue

            finish_streaming = getattr(parser, "finish_streaming", None)
            if finish_streaming is None:
                continue
            flushed = finish_streaming()
            if flushed is None:
                continue

            if delta_message is None:
                delta_message = flushed
                continue

            if flushed.content:
                existing_content = delta_message.content or ""
                delta_message.content = existing_content + flushed.content
            if flushed.reasoning:
                existing_reasoning = delta_message.reasoning or ""
                delta_message.reasoning = existing_reasoning + flushed.reasoning
            if flushed.tool_calls:
                existing_calls = delta_message.tool_calls or []
                delta_message.tool_calls = existing_calls + flushed.tool_calls
        return delta_message

_append_unstreamed_tool_args(delta_message)

Append parsed-but-unstreamed tool-call arguments to delta_message.

Source code in vllm/parser/abstract_parser.py
def _append_unstreamed_tool_args(
    self,
    delta_message: DeltaMessage | None,
) -> None:
    """Append parsed-but-unstreamed tool-call arguments to *delta_message*.

    Only the last tool call of the delta is touched, and only when a tool
    parser is configured and that call has a ``function``. The message is
    modified in place.
    """
    if self._tool_parser is None or not delta_message:
        return
    pending_calls = delta_message.tool_calls
    if not pending_calls:
        return
    trailing_call = pending_calls[-1]
    if not trailing_call.function:
        return

    already_streamed = trailing_call.function.arguments or ""
    trailing_call.function.arguments = (
        already_streamed + self._tool_parser.get_remaining_unstreamed_args()
    )

_flush_engine_parsers(delta_message)

Flush buffered state from engine-based parsers at stream end.

Source code in vllm/parser/abstract_parser.py
def _flush_engine_parsers(
    self, delta_message: DeltaMessage | None
) -> DeltaMessage | None:
    """Flush buffered state from engine-based parsers at stream end.

    Each engine-based parser that exposes ``finish_streaming`` is flushed,
    reasoning parser first, and the flushed content, reasoning and tool
    calls are appended to *delta_message* (or become the message when there
    was none).
    """
    reasoning_ended = self._stream_state.reasoning_ended
    for parser in (self._reasoning_parser, self._tool_parser):
        if not getattr(parser, "engine_based_streaming", False):
            continue
        # When reasoning has ended and we transitioned to the tool
        # phase, the reasoning parser's engine may still have buffered
        # characters from tool-call markup it saw with
        # skip_tool_parsing=True.  Flushing that would leak spurious
        # content (e.g. a stray '"'), so skip it.
        if reasoning_ended and parser is self._reasoning_parser:
            continue

        finish_streaming = getattr(parser, "finish_streaming", None)
        if finish_streaming is None:
            continue
        flushed = finish_streaming()
        if flushed is None:
            continue

        if delta_message is None:
            delta_message = flushed
            continue

        if flushed.content:
            existing_content = delta_message.content or ""
            delta_message.content = existing_content + flushed.content
        if flushed.reasoning:
            existing_reasoning = delta_message.reasoning or ""
            delta_message.reasoning = existing_reasoning + flushed.reasoning
        if flushed.tool_calls:
            existing_calls = delta_message.tool_calls or []
            delta_message.tool_calls = existing_calls + flushed.tool_calls
    return delta_message

finalize_generation(delta_message, request, state)

Finalize generation for cases where generation was incomplete, for example when streaming terminated before reasoning ended.

Source code in vllm/parser/abstract_parser.py
def finalize_generation(
    self,
    delta_message: DeltaMessage | None,
    request: ChatCompletionRequest | ResponsesRequest,
    state: StreamState,
) -> DeltaMessage | None:
    """Finalize generation for cases where generation was incomplete.

    For example, streaming may have terminated before reasoning ended. In
    that case, a reasoning parser that offers
    ``get_streaming_fallback_content`` is asked for text to promote, and any
    non-empty result is appended to the delta's content (creating the delta
    if needed). Unstreamed tool-call arguments are appended last.
    """
    promote = getattr(
        self._reasoning_parser, "get_streaming_fallback_content", None
    )
    if promote is not None and not state.reasoning_ended:
        promoted_text = promote(state.previous_text, request)
        if promoted_text:
            if delta_message is None:
                delta_message = DeltaMessage()
            existing_content = delta_message.content or ""
            delta_message.content = existing_content + promoted_text

    self._append_unstreamed_tool_args(delta_message)
    return delta_message

Parser

Abstract Parser class that unifies ReasoningParser and ToolParser into a single interface for parsing model output.

This class provides a unified way to handle both reasoning extraction (e.g., chain-of-thought content in `<think>` tags) and tool call extraction (e.g., function calls in XML/JSON format) from model outputs.

Subclasses can either:

1. Override the abstract methods directly for custom parsing logic.
2. Set the `reasoning_parser` and `tool_parser` properties to delegate to existing parser implementations.

Class Attributes

- reasoning_parser_cls: The ReasoningParser class to use (for compatibility with code that needs the class, not an instance).
- tool_parser_cls: The ToolParser class to use (for compatibility with code that needs the class, not an instance).

Methods:

Attributes:

Source code in vllm/parser/abstract_parser.py
class Parser:
    """
    Common front end for reasoning extraction and tool-call extraction.

    A ``Parser`` puts the responsibilities of a ReasoningParser (e.g.
    chain-of-thought content in <think> tags) and of a ToolParser (e.g.
    function calls in XML/JSON format) behind a single interface for
    parsing model output.

    Subclasses can either:
    1. Override the abstract methods directly for custom parsing logic
    2. Set the `reasoning_parser` and `tool_parser` properties to delegate
       to existing parser implementations

    NOTE: as written, this class has no ``abc.ABC`` base, so the
    ``@abstractmethod`` markers below document intent but are not enforced
    when the class is instantiated.

    Class Attributes:
        reasoning_parser_cls: ReasoningParser class to instantiate in
            ``__init__`` (also kept for code that needs the class rather
            than an instance). ``None`` means no reasoning parser.
        tool_parser_cls: ToolParser class to instantiate in ``__init__``
            (also kept for code that needs the class rather than an
            instance). ``None`` means no tool parser.
    """

    # Parser classes are declared at class level for compatibility with
    # existing patterns; subclasses override them to pick implementations.
    reasoning_parser_cls: type[ReasoningParser] | None = None
    tool_parser_cls: type[ToolParser] | None = None

    def __init__(
        self,
        tokenizer: TokenizerLike,
        tools: list[Tool] | None = None,
        *args,
        **kwargs,
    ):
        """
        Build the configured sub-parsers and the per-stream state.

        Args:
            tokenizer: Tokenizer stored as ``model_tokenizer`` and handed to
                both sub-parsers.
            tools: Tool definitions forwarded to the tool parser only.
            *args: Extra positional arguments forwarded to the reasoning
                parser only.
            **kwargs: Extra keyword arguments forwarded to the reasoning
                parser only.
        """
        self.model_tokenizer = tokenizer
        owner = self.__class__

        reasoning_cls = owner.reasoning_parser_cls
        self._reasoning_parser: ReasoningParser | None = (
            None if reasoning_cls is None else reasoning_cls(tokenizer, *args, **kwargs)
        )

        tool_cls = owner.tool_parser_cls
        self._tool_parser: ToolParser | None = (
            None if tool_cls is None else tool_cls(tokenizer, tools)
        )

        # Engine-based streaming is used only when every sub-parser that is
        # present reports it; a missing sub-parser does not veto it.
        reasoning = self._reasoning_parser
        tool = self._tool_parser
        self._engine_based = (
            reasoning is None or reasoning.engine_based_streaming
        ) and (tool is None or tool.engine_based_streaming)
        self._stream_state = StreamState(engine_based=self._engine_based)

    @cached_property
    def vocab(self) -> dict[str, int]:
        """Token-to-ID vocabulary obtained from the tokenizer (cached)."""
        tokenizer = self.model_tokenizer
        return tokenizer.get_vocab()

    @property
    def reasoning_parser(self) -> ReasoningParser | None:
        """The reasoning parser in use, or None when there is none."""
        return self._reasoning_parser

    @reasoning_parser.setter
    def reasoning_parser(self, parser: ReasoningParser | None) -> None:
        self._reasoning_parser = parser

    @property
    def tool_parser(self) -> ToolParser | None:
        """The tool parser in use, or None when there is none."""
        return self._tool_parser

    @tool_parser.setter
    def tool_parser(self, parser: ToolParser | None) -> None:
        self._tool_parser = parser

    # ========== Reasoning Parser Methods ==========

    @abstractmethod
    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """
        Report whether reasoning has finished within ``input_ids``.

        Structured engines such as `xgrammar` use this to check whether
        the reasoning content ends in the model output.

        Args:
            input_ids: Token IDs of the model output.

        Returns:
            True when the reasoning content ends in ``input_ids``.
        """

    def is_reasoning_end_streaming(
        self, input_ids: list[int], delta_ids: list[int]
    ) -> bool:
        """
        Report whether reasoning has finished at the current decode step.

        This default implementation does not inspect ``delta_ids``; it
        defers to ``is_reasoning_end(input_ids)``.

        Args:
            input_ids: The entire model output token IDs.
            delta_ids: The last few tokens computed at this decode step.

        Returns:
            The result of ``is_reasoning_end(input_ids)``.
        """
        ended = self.is_reasoning_end(input_ids)
        return ended

    @abstractmethod
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Return the token IDs that belong to the content part of the output.

        The content is the non-reasoning portion (e.g., everything after
        the </think> tag).

        Args:
            input_ids: Token IDs of the model output.

        Returns:
            The content token IDs.
        """

    @abstractmethod
    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """
        Split a complete model-generated string into reasoning and content.

        Intended for non-streaming responses, where the whole model
        response is available before anything is sent to the client.

        Args:
            model_output: The complete model-generated string.
            request: The request object that produced the output.

        Returns:
            A ``(reasoning, response_content)`` tuple.
        """

    @abstractmethod
    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Pull reasoning content out of one streaming delta.

        Args:
            previous_text: Text from all previous tokens.
            current_text: Text including the current delta.
            delta_text: The new text in this delta.
            previous_token_ids: Token IDs from previous generation.
            current_token_ids: All token IDs including the current delta.
            delta_token_ids: The new token IDs in this delta.

        Returns:
            A DeltaMessage with reasoning and/or content fields, or None.
        """

    # ========== Tool Parser Methods ==========

    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        """
        Hook for tweaking request parameters for tool calling.

        The base implementation returns ``request`` untouched. Subclasses
        may override it to modify request parameters (e.g., setting
        structured output schemas for tool calling).

        Args:
            request: The original request.

        Returns:
            The adjusted request.
        """
        return request

    @abstractmethod
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> ExtractedToolCallInformation:
        """
        Pull tool calls out of a complete model-generated string.

        Intended for non-streaming responses.

        Args:
            model_output: The complete model-generated string.
            request: The request object that produced the output.

        Returns:
            ExtractedToolCallInformation describing the tool calls.
        """

    @abstractmethod
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> DeltaMessage | None:
        """
        Pull tool calls out of one streaming delta.

        Args:
            previous_text: Text from all previous tokens.
            current_text: Text including the current delta.
            delta_text: The new text in this delta.
            previous_token_ids: Token IDs from previous generation.
            current_token_ids: All token IDs including the current delta.
            delta_token_ids: The new token IDs in this delta.
            request: The request object.

        Returns:
            A DeltaMessage with the tool_calls field, or None.
        """

    # ========== Combined Parsing ==========

    @abstractmethod
    def parse(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
        enable_auto_tools: bool = False,
        model_output_token_ids: Sequence[int] = (),
    ) -> tuple[str | None, str | None, list[FunctionCall] | None]:
        """Extract reasoning and tool calls from a complete model output.

        Args:
            model_output: The complete model-generated string.
            request: The request object that produced the output.
            enable_auto_tools: Whether to enable automatic tool call parsing.
            model_output_token_ids: The generated raw output token IDs.

        Returns:
            A ``(reasoning, content, tool_calls)`` tuple.
        """

    @abstractmethod
    def parse_delta(
        self,
        delta_text: str,
        delta_token_ids: list[int],
        request: ChatCompletionRequest | ResponsesRequest,
        prompt_token_ids: list[int] | None = None,
        *,
        finished: bool,
    ) -> DeltaMessage | None:
        """Process one streaming delta: reasoning extraction first, then
        tool call extraction, coordinated through internal stream state.
        """

reasoning_parser property writable

The underlying reasoning parser, if any.

tool_parser property writable

The underlying tool parser, if any.

vocab cached property

Get the vocabulary mapping from tokens to IDs.

adjust_request(request)

Adjust the request parameters for tool calling.

Can be overridden by subclasses to modify request parameters (e.g., setting structured output schemas for tool calling).

Parameters:

Returns:

Source code in vllm/parser/abstract_parser.py
def adjust_request(
    self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
    """
    Hook for tweaking request parameters for tool calling.

    The base implementation returns ``request`` untouched. Subclasses
    may override it to modify request parameters (e.g., setting
    structured output schemas for tool calling).

    Args:
        request: The original request.

    Returns:
        The adjusted request.
    """
    return request

extract_content_ids(input_ids) abstractmethod

Extract content token IDs from the input_ids.

This extracts the non-reasoning content (e.g., everything after the `</think>` tag).

Parameters:

  • input_ids

    (list[int]) –

    The token IDs of the model output.

Returns:

  • list[int]

    The extracted content token IDs.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    """
    Return the token IDs that belong to the content part of the output.

    The content is the non-reasoning portion (e.g., everything after
    the </think> tag).

    Args:
        input_ids: Token IDs of the model output.

    Returns:
        The content token IDs.
    """

extract_reasoning(model_output, request) abstractmethod

Extract reasoning content from a complete model-generated string.

Used for non-streaming responses where we have the entire model response available before sending to the client.

Parameters:

Returns:

  • tuple[str | None, str | None]

    A tuple of (reasoning, response_content).

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_reasoning(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
    """
    Split a complete model-generated string into reasoning and content.

    Intended for non-streaming responses, where the whole model
    response is available before anything is sent to the client.

    Args:
        model_output: The complete model-generated string.
        request: The request object that produced the output.

    Returns:
        A ``(reasoning, response_content)`` tuple.
    """

extract_reasoning_streaming(previous_text, current_text, delta_text, previous_token_ids, current_token_ids, delta_token_ids) abstractmethod

Extract reasoning content from a streaming delta message.

Parameters:

  • previous_text

    (str) –

    Text from all previous tokens.

  • current_text

    (str) –

    Text including the current delta.

  • delta_text

    (str) –

    The new text in this delta.

  • previous_token_ids

    (Sequence[int]) –

    Token IDs from previous generation.

  • current_token_ids

    (Sequence[int]) –

    All token IDs including current.

  • delta_token_ids

    (Sequence[int]) –

    The new token IDs in this delta.

Returns:

  • DeltaMessage | None

    A DeltaMessage with reasoning and/or content fields, or None.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """
    Pull reasoning content out of one streaming delta.

    Args:
        previous_text: Text from all previous tokens.
        current_text: Text including the current delta.
        delta_text: The new text in this delta.
        previous_token_ids: Token IDs from previous generation.
        current_token_ids: All token IDs including the current delta.
        delta_token_ids: The new token IDs in this delta.

    Returns:
        A DeltaMessage with reasoning and/or content fields, or None.
    """

extract_tool_calls(model_output, request) abstractmethod

Extract tool calls from a complete model-generated string.

Used for non-streaming responses.

Parameters:

Returns:

  • ExtractedToolCallInformation

    ExtractedToolCallInformation containing the tool calls.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> ExtractedToolCallInformation:
    """
    Pull tool calls out of a complete model-generated string.

    Intended for non-streaming responses.

    Args:
        model_output: The complete model-generated string.
        request: The request object that produced the output.

    Returns:
        ExtractedToolCallInformation describing the tool calls.
    """

extract_tool_calls_streaming(previous_text, current_text, delta_text, previous_token_ids, current_token_ids, delta_token_ids, request) abstractmethod

Extract tool calls from a streaming delta message.

Parameters:

  • previous_text

    (str) –

    Text from all previous tokens.

  • current_text

    (str) –

    Text including the current delta.

  • delta_text

    (str) –

    The new text in this delta.

  • previous_token_ids

    (Sequence[int]) –

    Token IDs from previous generation.

  • current_token_ids

    (Sequence[int]) –

    All token IDs including current.

  • delta_token_ids

    (Sequence[int]) –

    The new token IDs in this delta.

  • request

    (ChatCompletionRequest | ResponsesRequest) –

    The request object.

Returns:

  • DeltaMessage | None

    A DeltaMessage with tool_calls field, or None.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest | ResponsesRequest,
) -> DeltaMessage | None:
    """
    Pull tool calls out of one streaming delta.

    Args:
        previous_text: Text from all previous tokens.
        current_text: Text including the current delta.
        delta_text: The new text in this delta.
        previous_token_ids: Token IDs from previous generation.
        current_token_ids: All token IDs including the current delta.
        delta_token_ids: The new token IDs in this delta.
        request: The request object.

    Returns:
        A DeltaMessage with the tool_calls field, or None.
    """

is_reasoning_end(input_ids) abstractmethod

Check if the reasoning content ends in the input_ids.

Used by structured engines like xgrammar to check if the reasoning content ends in the model output.

Parameters:

  • input_ids

    (list[int]) –

    The token IDs of the model output.

Returns:

  • bool

    True if the reasoning content ends in the input_ids.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def is_reasoning_end(self, input_ids: list[int]) -> bool:
    """
    Report whether reasoning has finished within ``input_ids``.

    Structured engines such as `xgrammar` use this to check whether
    the reasoning content ends in the model output.

    Args:
        input_ids: Token IDs of the model output.

    Returns:
        True when the reasoning content ends in ``input_ids``.
    """

is_reasoning_end_streaming(input_ids, delta_ids)

Check if the reasoning content ends during a decode step.

Parameters:

  • input_ids

    (list[int]) –

    The entire model output token IDs.

  • delta_ids

    (list[int]) –

    The last few computed tokens at the current decode step.

Returns:

  • bool

    True if the reasoning content ends in the delta_ids.

Source code in vllm/parser/abstract_parser.py
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Report whether reasoning has finished at the current decode step.

    This default implementation does not inspect ``delta_ids``; it
    defers to ``is_reasoning_end(input_ids)``.

    Args:
        input_ids: The entire model output token IDs.
        delta_ids: The last few tokens computed at this decode step.

    Returns:
        The result of ``is_reasoning_end(input_ids)``.
    """
    ended = self.is_reasoning_end(input_ids)
    return ended

parse(model_output, request, enable_auto_tools=False, model_output_token_ids=()) abstractmethod

Parse a complete model output, extracting reasoning and tool calls.

Parameters:

  • model_output

    (str) –

    The complete model-generated string.

  • request

    (ChatCompletionRequest | ResponsesRequest) –

    The request object used to generate the output.

  • enable_auto_tools

    (bool, default: False ) –

    Whether to enable automatic tool call parsing.

  • model_output_token_ids

    (Sequence[int], default: () ) –

    The generated raw output token IDs.

Returns:

  • tuple[str | None, str | None, list[FunctionCall] | None]

    A tuple of (reasoning, content, tool_calls).

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def parse(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
    enable_auto_tools: bool = False,
    model_output_token_ids: Sequence[int] = (),
) -> tuple[str | None, str | None, list[FunctionCall] | None]:
    """Extract reasoning and tool calls from a complete model output.

    Args:
        model_output: The complete model-generated string.
        request: The request object that produced the output.
        enable_auto_tools: Whether to enable automatic tool call parsing.
        model_output_token_ids: The generated raw output token IDs.

    Returns:
        A ``(reasoning, content, tool_calls)`` tuple.
    """

parse_delta(delta_text, delta_token_ids, request, prompt_token_ids=None, *, finished) abstractmethod

Parse a single streaming delta, orchestrating reasoning then tool call extraction via internal stream state.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def parse_delta(
    self,
    delta_text: str,
    delta_token_ids: list[int],
    request: ChatCompletionRequest | ResponsesRequest,
    prompt_token_ids: list[int] | None = None,
    *,
    finished: bool,
) -> DeltaMessage | None:
    """Process one streaming delta: reasoning extraction first, then
    tool call extraction, coordinated through internal stream state.
    """

StreamState dataclass

Mutable state for Parser.parse_delta(). One per stream.

Source code in vllm/parser/abstract_parser.py
@dataclass
class StreamState:
    """Per-stream mutable bookkeeping for ``Parser.parse_delta()``.

    One instance is meant to exist per stream.
    """

    reasoning_ended: bool = False
    tool_call_text_started: bool = False
    prompt_reasoning_checked: bool = False
    # Text / token IDs recorded by the last ``commit()``; they stay empty
    # while ``engine_based`` is set.
    previous_text: str = ""
    previous_token_ids: list[int] = field(default_factory=list)
    history_tool_call_cnt: int = 0
    tool_call_id_type: str = "random"
    # only used for "required" and "named tool" choices,
    # tracks whether function name has been fully returned in the stream yet
    function_name_returned: bool = False
    # When set, ``advance()`` passes deltas through unchanged and
    # ``commit()`` keeps no history.
    engine_based: bool = False

    def advance(
        self,
        delta_text: str,
        delta_token_ids: list[int],
    ) -> tuple[str, list[int]]:
        """Return the ``(text, token_ids)`` pair to parse for this delta.

        Without ``engine_based`` the delta is appended to the committed
        history (new objects are built; the state is not modified). With
        ``engine_based`` the delta itself is returned as-is.
        """
        if not self.engine_based:
            accumulated_text = self.previous_text + delta_text
            accumulated_ids = self.previous_token_ids + delta_token_ids
            return accumulated_text, accumulated_ids
        return delta_text, delta_token_ids

    def commit(
        self,
        current_text: str,
        current_token_ids: list[int],
    ) -> None:
        """Record what has been processed so far.

        Without ``engine_based`` the given text and token IDs become the
        new history; with ``engine_based`` the history is reset to empty.
        """
        if self.engine_based:
            current_text, current_token_ids = "", []
        self.previous_text = current_text
        self.previous_token_ids = current_token_ids