Skip to content

vllm.reasoning.minimax_m3_reasoning_parser

Classes:

MiniMaxM3ReasoningParser

Bases: BaseThinkingReasoningParser

Reasoning parser for MiniMax M3 explicit thinking blocks.

MiniMax M3 emits reasoning as:

<mm:think>reasoning text</mm:think>assistant content

The M3 tokenizer exposes both markers as complete vocabulary tokens. The chat template may also prefill the start marker when thinking_mode="enabled", so generated text can begin directly inside a reasoning block without emitting <mm:think> again.

Source code in vllm/reasoning/minimax_m3_reasoning_parser.py
class MiniMaxM3ReasoningParser(BaseThinkingReasoningParser):
    """Reasoning parser for MiniMax M3 explicit thinking blocks.

    MiniMax M3 emits reasoning as:

        <mm:think>reasoning text</mm:think>assistant content

    The M3 tokenizer exposes both markers as complete vocabulary tokens. The
    chat template may also prefill the start marker when
    ``thinking_mode="enabled"``, so generated text can begin directly inside a
    reasoning block without emitting ``<mm:think>`` again.

    Instance state:
        _initial_in_reasoning: True when ``chat_template_kwargs`` carried
            ``thinking_mode == "enabled"``; generation is then treated as
            starting inside an already-open reasoning block.
        _at_response_start: True until the first non-empty streaming delta
            has been seen (only consulted when not prefilled).

    ``start_token_id`` / ``end_token_id`` are used throughout but not defined
    here; they are presumably provided by the base class from the tokenizer.
    """

    @property
    def start_token(self) -> str:
        """Return the marker that opens a reasoning block."""
        return "<mm:think>"

    @property
    def end_token(self) -> str:
        """Return the marker that closes a reasoning block."""
        return "</mm:think>"

    def __init__(self, tokenizer, *args, **kwargs):
        """Initialise the parser and read the thinking mode from kwargs.

        Args:
            tokenizer: Forwarded unchanged to the base class.
            *args: Forwarded unchanged to the base class.
            **kwargs: Forwarded unchanged to the base class. The optional
                ``chat_template_kwargs`` mapping is inspected for
                ``thinking_mode``; a missing or ``None`` mapping is treated
                as empty.
        """
        super().__init__(tokenizer, *args, **kwargs)
        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
        self._initial_in_reasoning = chat_kwargs.get("thinking_mode") == "enabled"
        self._at_response_start = True

    def extract_reasoning(
        self,
        model_output: str,
        request: "ChatCompletionRequest | ResponsesRequest",
    ) -> tuple[str | None, str | None]:
        """Split a complete model output into ``(reasoning, content)``.

        Args:
            model_output: The full generated text.
            request: Unused by this implementation.

        Returns:
            A ``(reasoning, content)`` tuple. ``content`` is ``None`` when it
            would be empty; ``reasoning`` is ``None`` when no reasoning block
            was recognised.
        """
        # MiniMax M3 can start a response with a stray closer. Drop that first
        # token only; later unmatched closers stay visible as content.
        if not self._initial_in_reasoning and model_output.startswith(self.end_token):
            content = model_output[len(self.end_token) :]
            return None, content or None

        if self._initial_in_reasoning:
            reasoning, end, content = model_output.partition(self.end_token)
            # Only a start marker *before* the first closer means the model
            # re-emitted the opener. A start marker that shows up later must
            # not re-route parsing: the prefilled block already closed at the
            # first closer, which is also what token-by-token streaming
            # reports. (With no closer, ``reasoning`` is the whole output.)
            if self.start_token not in reasoning:
                if not end:
                    return model_output, None
                return reasoning, content or None

        if self.start_token not in model_output:
            return None, model_output

        # Explicit start marker: text before it and text after the first
        # closer are both content; the span in between is reasoning.
        content_before, _, after_start = model_output.partition(self.start_token)
        reasoning, end, content_after = after_start.partition(self.end_token)
        if not end:
            # Unterminated block: everything after the opener is reasoning.
            return reasoning, content_before or None

        return reasoning, (content_before + content_after) or None

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        """Report whether reasoning is finished, given the ids seen so far.

        Args:
            input_ids: Token ids accumulated so far.
            delta_ids: Token ids of the latest delta.

        Returns:
            True when a closer id is present in either argument, or when the
            parser is not prefilled, ``input_ids`` is non-empty and contains
            no start id (i.e. no reasoning block was opened). False otherwise.
        """
        # Materialise once: ``delta_ids`` may be a one-shot iterable.
        delta_ids = tuple(delta_ids)
        if self.end_token_id in delta_ids:
            return True
        if self.end_token_id in input_ids:
            return True
        if self._initial_in_reasoning:
            return False
        if self.start_token_id not in input_ids:
            return bool(input_ids)
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return the token ids that belong to the content part.

        Args:
            input_ids: Token ids of the output.

        Returns:
            The ids after the *last* closer id when one is present; otherwise
            all ids when no reasoning block is open (not prefilled and no
            start id), and an empty list while still inside reasoning.
        """
        # NOTE(review): this cuts at the last closer, whereas the text-level
        # methods split at the first one - confirm that is intended.
        if self.end_token_id in input_ids:
            end_index = len(input_ids) - 1 - input_ids[::-1].index(self.end_token_id)
            return input_ids[end_index + 1 :]

        if self._initial_in_reasoning and self.start_token_id not in input_ids:
            return []

        if self.start_token_id not in input_ids:
            return input_ids
        return []

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Classify one streaming delta as reasoning and/or content.

        Only ``delta_text``, ``previous_token_ids`` and ``delta_token_ids``
        are consulted; the remaining parameters are accepted for interface
        compatibility.

        Returns:
            A ``DeltaMessage`` carrying ``reasoning`` and/or ``content``, or
            ``None`` when the delta has nothing to emit (empty text, a lone
            leading closer, or a lone start marker).
        """
        if not delta_text:
            return None

        if self._at_response_start and not self._initial_in_reasoning:
            # Apply the leading-closer tolerance once. Later unmatched closers
            # stay visible as content.
            self._at_response_start = False
            if delta_text.startswith(self.end_token):
                delta_text = delta_text[len(self.end_token) :]
                if not delta_text:
                    return None
                if delta_token_ids and delta_token_ids[0] == self.end_token_id:
                    delta_token_ids = delta_token_ids[1:]

        # A closer was already seen: everything from here on is content.
        if self.end_token_id in previous_token_ids:
            return DeltaMessage(content=delta_text)

        # Prefilled reasoning block and the model has not re-emitted the
        # opener: text is reasoning until the closer arrives.
        if (
            self._initial_in_reasoning
            and self.start_token_id not in previous_token_ids
            and self.start_token_id not in delta_token_ids
        ):
            if self.end_token_id in delta_token_ids:
                reasoning, _, content = delta_text.partition(self.end_token)
                return DeltaMessage(
                    reasoning=reasoning or None,
                    content=content or None,
                )
            return DeltaMessage(reasoning=delta_text)

        # Not prefilled and no opener seen yet: plain content.
        if (
            self.start_token_id not in previous_token_ids
            and self.start_token_id not in delta_token_ids
        ):
            return DeltaMessage(content=delta_text)

        if self.end_token_id in delta_token_ids:
            reasoning_text, _, content = delta_text.partition(self.end_token)
            # Strip the opener only when it really sits before the closer.
            # If the delta's start marker comes *after* the closer, an
            # unconditional partition would find nothing and return "" as its
            # tail, silently discarding the reasoning text.
            if (
                self.start_token_id in delta_token_ids
                and self.start_token in reasoning_text
            ):
                _, _, reasoning_text = reasoning_text.partition(self.start_token)
            return DeltaMessage(
                reasoning=reasoning_text or None,
                content=content or None,
            )

        if self.start_token_id in delta_token_ids:
            # Opener arrives in this delta: emit only what follows it.
            _, _, reasoning = delta_text.partition(self.start_token)
            return DeltaMessage(reasoning=reasoning) if reasoning else None

        # Opener was seen earlier and no closer yet: still reasoning.
        return DeltaMessage(reasoning=delta_text)

    def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int:
        """Count the tokens that fall inside reasoning blocks.

        When not prefilled this defers to the base implementation. When
        prefilled, counting starts at nesting depth 1 (the block opened by
        the chat template); start ids raise the depth, closer ids lower it
        (never below zero), and marker ids themselves are not counted.

        Args:
            token_ids: Token ids of the generated output.

        Returns:
            The number of non-marker ids seen while the depth is positive.
        """
        if not self._initial_in_reasoning:
            return super().count_reasoning_tokens(token_ids)

        count = 0
        depth = 1
        for token_id in token_ids:
            if token_id == self.start_token_id:
                depth += 1
                continue
            if token_id == self.end_token_id:
                if depth > 0:
                    depth -= 1
                continue
            if depth > 0:
                count += 1
        return count