InternLM · lvhan028 · Apr 20, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
@@ -41,7 +41,8 @@
                                             GenerateReqInput, GenerateReqMetaOutput, GenerateReqOutput, LogProbs,
                                             ModelCard, ModelList, ModelPermission, PoolingRequest, PoolingResponse,
                                             TopLogprob, UpdateParamsRequest, UsageInfo)
-from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import ReasoningParser, ReasoningParserManager
+from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import (ReasoningParser, ReasoningParserManager,
+                                                                     get_streaming_state)
 from lmdeploy.serve.openai.tool_parser.tool_parser import ToolParser, ToolParserManager
 from lmdeploy.serve.utils.server_utils import validate_json_request
 from lmdeploy.tokenizer import DetokenizeState, Tokenizer
@@ -505,13 +506,10 @@ def create_stream_response_json(index: int,
         return response_json
 
     async def completion_stream_generator() -> AsyncGenerator[str, None]:
-        previous_text = ''
-        current_text = ''
-        previous_token_ids = []
-        current_token_ids = []
-        delta_token_ids = []
         has_parser = VariableInterface.tool_parser is not None or VariableInterface.reasoning_parser is not None
         streaming_tools = False
+        # Shared state for streaming parsers (previous/current text & token ids)
+        parser_state = get_streaming_state(request) if has_parser else None
         async for res in result_generator:
             logprobs, usage = None, None
             if gen_logprobs and res.logprobs:
@@ -533,20 +531,13 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                     res.finish_reason = 'tool_calls'
             else:
                 delta_message = DeltaMessage(role='assistant', content=res.response)
-                if has_parser:
-                    current_text = current_text + res.response
-                    current_token_ids = current_token_ids + delta_token_ids
+                if parser_state is not None:
+                    parser_state.update(res.response, delta_token_ids)
                 if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
                     if res.finish_reason == 'stop' and streaming_tools is True:
                         res.finish_reason = 'tool_calls'
                     tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
-                        previous_text=previous_text,
-                        current_text=current_text,
-                        delta_text=delta_message.content,
-                        previous_token_ids=previous_token_ids,
-                        current_token_ids=current_token_ids,
-                        delta_token_ids=delta_token_ids,
-                        request=request)
+                        delta_text=delta_message.content, delta_token_ids=delta_token_ids, request=request)
                     if tool_delta is not None:
                         delta_message.tool_calls = tool_delta.tool_calls
                         delta_message.content = tool_delta.content
@@ -557,18 +548,12 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                     logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
                 if VariableInterface.reasoning_parser is not None and enable_thinking is not False:
                     reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
-                        previous_text=previous_text,
-                        current_text=current_text,
-                        delta_text=delta_message.content or '',
-                        previous_token_ids=previous_token_ids,
-                        current_token_ids=current_token_ids,
-                        delta_token_ids=delta_token_ids)
+                        delta_text=delta_message.content or '', delta_token_ids=delta_token_ids, request=request)
                     if reasoning_delta is not None:
                         delta_message.reasoning_content = reasoning_delta.reasoning_content
                         delta_message.content = reasoning_delta.content
-                if has_parser:
-                    previous_text = current_text
-                    previous_token_ids = current_token_ids
+                if parser_state is not None:
+                    parser_state.step()
             if request.return_token_ids:
                 delta_message.gen_tokens = delta_token_ids
             response_json = create_stream_response_json(index=0,

diff --git a/lmdeploy/serve/openai/reasoning_parser/__init__.py b/lmdeploy/serve/openai/reasoning_parser/__init__.py
@@ -1,6 +1,15 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
-from .qwen_qwq_reasoning_parser import QwenQwQReasoningParser
-from .reasoning_parser import ReasoningParser, ReasoningParserManager
+from .qwen_reasoning_parser import QwenQwQReasoningParser
+from .reasoning_parser import (ReasoningParser, ReasoningParserManager, StreamingParserState, ThinkingReasoningParser,
+                               get_streaming_state)
 
-__all__ = ['ReasoningParser', 'ReasoningParserManager', 'DeepSeekR1ReasoningParser', 'QwenQwQReasoningParser']
+__all__ = [
+    'ReasoningParser',
+    'ReasoningParserManager',
+    'StreamingParserState',
+    'ThinkingReasoningParser',
+    'get_streaming_state',
+    'DeepSeekR1ReasoningParser',
+    'QwenQwQReasoningParser',
+]
diff --git a/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py
@@ -1,140 +1,25 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers
-import re
-from typing import Optional, Sequence, Tuple, Union
-
-from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage
-
-from .reasoning_parser import ReasoningParser, ReasoningParserManager
+from .reasoning_parser import ReasoningParserManager, ThinkingReasoningParser
 
 
 @ReasoningParserManager.register_module(name='deepseek-r1')
-class DeepSeekR1ReasoningParser(ReasoningParser):
+class DeepSeekR1ReasoningParser(ThinkingReasoningParser):
     """Reasoning parser for DeepSeek R1 model.
 
-    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning
-    content from the model output.
+    Reasoning is delimited by <think>...</think>. DeepSeek R1 may omit the
+    start tag, so in non-streaming mode an output with no end tag is treated
+    entirely as reasoning content.
+
+    Ref: https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
     """
 
+    start_token = '<think>'
+    end_token = '</think>'
+    strip_newlines = False
+    on_missing_end_tag = 'reasoning'
+
     def __init__(self, tokenizer: object):
         super().__init__(tokenizer)
-        self.think_start_token = '<think>'
-        self.think_end_token = '</think>'
-
-        self.reasoning_regex = re.compile(rf'{self.think_start_token}(.*?){self.think_end_token}', re.DOTALL)
-
-        if not self.model_tokenizer:
-            raise ValueError('The model tokenizer must be passed to the ReasoningParser '
-                             'constructor during construction.')
-
-        self.think_start_token_id = self.vocab.get(self.think_start_token)
-        self.think_end_token_id = self.vocab.get(self.think_end_token)
-        if (self.think_start_token_id is None or self.think_end_token_id is None):
-            raise RuntimeError('DeepSeek R1 reasoning parser could not locate think start/end '
-                               'tokens in the tokenizer!')
-
-    def extract_reasoning_content_streaming(
-        self,
-        previous_text: str,
-        current_text: str,
-        delta_text: str,
-        previous_token_ids: Sequence[int],
-        current_token_ids: Sequence[int],
-        delta_token_ids: Sequence[int],
-        **kwargs,
-    ) -> Union[DeltaMessage, None]:
-        """Instance method that should be implemented for extracting reasoning
-        from an incomplete response; for use when handling reasoning calls and
-        streaming.
-
-        Has to be an instance method because  it requires state - the current tokens/diffs, but also the information
-        about what has previously been parsed and extracted (see constructor)
-        """
-        # Skip single special tokens
-        if len(delta_token_ids) == 1:
-            if delta_token_ids[0] == self.think_end_token_id:
-                return DeltaMessage(content='')
-            elif delta_token_ids[0] == self.think_start_token_id:
-                return None
-
-        # Check if <think> is present in previous or delta.
-        # Keep compatibility with models that don't generate <think> tokens.
-        if self.think_start_token_id in previous_token_ids:
-            if self.think_end_token_id in delta_token_ids:
-                # <think> in previous, </think> in delta,
-                # extract reasoning content
-                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.think_end_token):]
-                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
-            elif self.think_end_token_id in previous_token_ids:
-                # <think> in previous, </think> in previous,
-                return DeltaMessage(content=delta_text)
-            else:
-                # <think> in previous, no </think> in previous or delta,
-                # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
-        elif self.think_start_token_id in delta_token_ids:
-            if self.think_end_token_id in delta_token_ids:
-                # <think> in delta, </think> in delta, extract reasoning content
-                start_index = delta_text.find(self.think_start_token)
-                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[start_index + len(self.think_start_token):end_index]
-                content = delta_text[end_index + len(self.think_end_token):]
-                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
-            else:
-                # <think> in delta, no </think> in delta,
-                # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
-        else:
-            # No <think> in previous or delta, also need to check for </think>.
-            # Because the model may have generated </think> without <think>
-            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
-            if self.think_end_token_id in delta_token_ids:
-                # </think> in delta with more tokens,
-                # extract reasoning content and content
-                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.think_end_token):]
-                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
-            elif self.think_end_token_id in previous_token_ids:
-                # </think> in previous, thinking content ends
-                return DeltaMessage(content=delta_text)
-            else:
-                # no </think> in previous or delta, reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
-
-    def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
-                                  **kwargs) -> Tuple[Optional[str], Optional[str]]:
-        """Extract reasoning content from a complete model-generated string.
-
-        Used for non-streaming responses where we have the entire model response
-        available before sending to the client.
-
-        Args:
-            model_output (str): The model-generated string to extract reasoning content from.
-            request (ChatCompletionRequest): he request object that was used to generate the model_output.
-
-        Returns:
-            reasoning_content (str | None): The reasoning content.
-            final_output (str | None): The content.
-        """
-        # DeepSeek R1 doesn't generate <think> now.
-        # Thus we assume the reasoning content is always at the start.
-        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
-        if self.think_end_token not in model_output:
-            return model_output, None
-        else:
-            # Add a start token if it's missing to keep compatibility.
-            if self.think_start_token not in model_output:
-                model_output = f'{self.think_start_token}{model_output}'
-            # Use a regex to find the reasoning content
-            reasoning_content = self.reasoning_regex.findall(model_output)[0]
-
-            end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
-            final_output = model_output[end_index:]
-
-            if len(final_output) == 0:
-                return reasoning_content, None
-
-            return reasoning_content, final_output
+        if self.start_token_id is None or self.end_token_id is None:
+            raise RuntimeError('DeepSeek R1 reasoning parser could not locate '
+                               'think start/end tokens in the tokenizer!')