Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
db3f184
improve reasoning parser
lvhan028 Mar 10, 2026
1572900
rename file
lvhan028 Mar 10, 2026
b895d53
minor fix
lvhan028 Mar 10, 2026
e8274c3
merge main
lvhan028 Mar 26, 2026
35b404c
refactor
lvhan028 Mar 26, 2026
c516394
update deepseek reasoning parser ut
lvhan028 Mar 26, 2026
dea23e0
Merge branch 'main' into improve-parsers
lvhan028 Mar 26, 2026
390d2ed
Merge branch 'main' into improve-parsers
lvhan028 Mar 27, 2026
d3eb973
agent's first refactor version
lvhan028 Mar 30, 2026
bc0502e
agent's 2nd refactor version
lvhan028 Apr 1, 2026
904490d
agent's 3rd refactor version
lvhan028 Apr 1, 2026
92eb62c
the 4-th version
lvhan028 Apr 1, 2026
754cf55
the 4-th version
lvhan028 Apr 1, 2026
39ca371
fix
lvhan028 Apr 1, 2026
525eb87
type hint
lvhan028 Apr 1, 2026
dd1280b
remove unused code
lvhan028 Apr 1, 2026
d028118
fix
lvhan028 Apr 2, 2026
f82998a
rename file test_qwen3_parser.py
lvhan028 Apr 2, 2026
47b3a68
Merge branch 'main' into improve-parsers
lvhan028 Apr 2, 2026
2f8208f
update qwen3.5 parsers tc
lvhan028 Apr 2, 2026
ee2f752
fix dump tools
lvhan028 Apr 2, 2026
2d6ffee
update exception
lvhan028 Apr 2, 2026
da3e868
reorg
lvhan028 Apr 3, 2026
8678cab
gpt-oss
lvhan028 Apr 4, 2026
c785a91
parser -> parsers
lvhan028 Apr 9, 2026
86631d2
merge main
lvhan028 Apr 15, 2026
4b55122
fix gpt-oss parser
lvhan028 Apr 15, 2026
f299c67
fix
lvhan028 Apr 15, 2026
2e05aa3
fix intern-s1 tool parser
lvhan028 Apr 15, 2026
0a58a51
fix intern-s1 parsers
lvhan028 Apr 15, 2026
cfc65a9
fix qwen3coder parser
lvhan028 Apr 17, 2026
d02562c
take schema into consideration
lvhan028 Apr 17, 2026
9025c1e
fix qwen3 tool parser
lvhan028 Apr 20, 2026
1c63aae
fix llama3
lvhan028 Apr 20, 2026
4e862ef
set spaces_between_special_tokens=False for internlm tool parser
lvhan028 Apr 20, 2026
0ced400
fix gpt-oss
lvhan028 Apr 20, 2026
6b56ec9
fix gpt-oss
lvhan028 Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 10 additions & 25 deletions lmdeploy/serve/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
GenerateReqInput, GenerateReqMetaOutput, GenerateReqOutput, LogProbs,
ModelCard, ModelList, ModelPermission, PoolingRequest, PoolingResponse,
TopLogprob, UpdateParamsRequest, UsageInfo)
from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import ReasoningParser, ReasoningParserManager
from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import (ReasoningParser, ReasoningParserManager,
get_streaming_state)
from lmdeploy.serve.openai.tool_parser.tool_parser import ToolParser, ToolParserManager
from lmdeploy.serve.utils.server_utils import validate_json_request
from lmdeploy.tokenizer import DetokenizeState, Tokenizer
Expand Down Expand Up @@ -505,13 +506,10 @@ def create_stream_response_json(index: int,
return response_json

async def completion_stream_generator() -> AsyncGenerator[str, None]:
previous_text = ''
current_text = ''
previous_token_ids = []
current_token_ids = []
delta_token_ids = []
has_parser = VariableInterface.tool_parser is not None or VariableInterface.reasoning_parser is not None
streaming_tools = False
# Shared state for streaming parsers (previous/current text & token ids)
parser_state = get_streaming_state(request) if has_parser else None
async for res in result_generator:
logprobs, usage = None, None
if gen_logprobs and res.logprobs:
Expand All @@ -533,20 +531,13 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
res.finish_reason = 'tool_calls'
else:
delta_message = DeltaMessage(role='assistant', content=res.response)
if has_parser:
current_text = current_text + res.response
current_token_ids = current_token_ids + delta_token_ids
if parser_state is not None:
parser_state.update(res.response, delta_token_ids)
if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
if res.finish_reason == 'stop' and streaming_tools is True:
res.finish_reason = 'tool_calls'
tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=delta_message.content,
previous_token_ids=previous_token_ids,
current_token_ids=current_token_ids,
delta_token_ids=delta_token_ids,
request=request)
delta_text=delta_message.content, delta_token_ids=delta_token_ids, request=request)
if tool_delta is not None:
delta_message.tool_calls = tool_delta.tool_calls
delta_message.content = tool_delta.content
Expand All @@ -557,18 +548,12 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
if VariableInterface.reasoning_parser is not None and enable_thinking is not False:
reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=delta_message.content or '',
previous_token_ids=previous_token_ids,
current_token_ids=current_token_ids,
delta_token_ids=delta_token_ids)
delta_text=delta_message.content or '', delta_token_ids=delta_token_ids, request=request)
if reasoning_delta is not None:
delta_message.reasoning_content = reasoning_delta.reasoning_content
delta_message.content = reasoning_delta.content
if has_parser:
previous_text = current_text
previous_token_ids = current_token_ids
if parser_state is not None:
parser_state.step()
if request.return_token_ids:
delta_message.gen_tokens = delta_token_ids
response_json = create_stream_response_json(index=0,
Expand Down
15 changes: 12 additions & 3 deletions lmdeploy/serve/openai/reasoning_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .qwen_qwq_reasoning_parser import QwenQwQReasoningParser
from .reasoning_parser import ReasoningParser, ReasoningParserManager
from .qwen_reasoning_parser import QwenQwQReasoningParser
from .reasoning_parser import (ReasoningParser, ReasoningParserManager, StreamingParserState, ThinkingReasoningParser,
get_streaming_state)

__all__ = ['ReasoningParser', 'ReasoningParserManager', 'DeepSeekR1ReasoningParser', 'QwenQwQReasoningParser']
__all__ = [
'ReasoningParser',
'ReasoningParserManager',
'StreamingParserState',
'ThinkingReasoningParser',
'get_streaming_state',
'DeepSeekR1ReasoningParser',
'QwenQwQReasoningParser',
]
145 changes: 15 additions & 130 deletions lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -1,140 +1,25 @@
# Copyright (c) OpenMMLab. All rights reserved.
# modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers
import re
from typing import Optional, Sequence, Tuple, Union

from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage

from .reasoning_parser import ReasoningParser, ReasoningParserManager
from .reasoning_parser import ReasoningParserManager, ThinkingReasoningParser


@ReasoningParserManager.register_module(name='deepseek-r1')
class DeepSeekR1ReasoningParser(ReasoningParser):
class DeepSeekR1ReasoningParser(ThinkingReasoningParser):
"""Reasoning parser for DeepSeek R1 model.

The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning
content from the model output.
Uses <think>...</think> tokens. When the end tag is missing in
non-streaming mode, the entire output is treated as reasoning content
(DeepSeek R1 may omit the start tag).

Ref: https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
"""

start_token = '<think>'
end_token = '</think>'
strip_newlines = False
on_missing_end_tag = 'reasoning'

def __init__(self, tokenizer: object):
super().__init__(tokenizer)
self.think_start_token = '<think>'
self.think_end_token = '</think>'

self.reasoning_regex = re.compile(rf'{self.think_start_token}(.*?){self.think_end_token}', re.DOTALL)

if not self.model_tokenizer:
raise ValueError('The model tokenizer must be passed to the ReasoningParser '
'constructor during construction.')

self.think_start_token_id = self.vocab.get(self.think_start_token)
self.think_end_token_id = self.vocab.get(self.think_end_token)
if (self.think_start_token_id is None or self.think_end_token_id is None):
raise RuntimeError('DeepSeek R1 reasoning parser could not locate think start/end '
'tokens in the tokenizer!')

def extract_reasoning_content_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
**kwargs,
) -> Union[DeltaMessage, None]:
"""Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
streaming.

Has to be an instance method because it requires state - the current tokens/diffs, but also the information
about what has previously been parsed and extracted (see constructor)
"""
# Skip single special tokens
if len(delta_token_ids) == 1:
if delta_token_ids[0] == self.think_end_token_id:
return DeltaMessage(content='')
elif delta_token_ids[0] == self.think_start_token_id:
return None

# Check if <think> is present in previous or delta.
# Keep compatibility with models that don't generate <think> tokens.
if self.think_start_token_id in previous_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in previous, </think> in delta,
# extract reasoning content
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token):]
return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
elif self.think_end_token_id in previous_token_ids:
# <think> in previous, </think> in previous,
return DeltaMessage(content=delta_text)
else:
# <think> in previous, no </think> in previous or delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
elif self.think_start_token_id in delta_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in delta, </think> in delta, extract reasoning content
start_index = delta_text.find(self.think_start_token)
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[start_index + len(self.think_start_token):end_index]
content = delta_text[end_index + len(self.think_end_token):]
return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
else:
# <think> in delta, no </think> in delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
else:
# No <think> in previous or delta, also need to check for </think>.
# Because the model may have generated </think> without <think>
# Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
if self.think_end_token_id in delta_token_ids:
# </think> in delta with more tokens,
# extract reasoning content and content
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token):]
return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
elif self.think_end_token_id in previous_token_ids:
# </think> in previous, thinking content ends
return DeltaMessage(content=delta_text)
else:
# no </think> in previous or delta, reasoning content continues
return DeltaMessage(reasoning_content=delta_text)

def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
**kwargs) -> Tuple[Optional[str], Optional[str]]:
"""Extract reasoning content from a complete model-generated string.

Used for non-streaming responses where we have the entire model response
available before sending to the client.

Args:
model_output (str): The model-generated string to extract reasoning content from.
request (ChatCompletionRequest): he request object that was used to generate the model_output.

Returns:
reasoning_content (str | None): The reasoning content.
final_output (str | None): The content.
"""
# DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start.
# Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
if self.think_end_token not in model_output:
return model_output, None
else:
# Add a start token if it's missing to keep compatibility.
if self.think_start_token not in model_output:
model_output = f'{self.think_start_token}{model_output}'
# Use a regex to find the reasoning content
reasoning_content = self.reasoning_regex.findall(model_output)[0]

end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
final_output = model_output[end_index:]

if len(final_output) == 0:
return reasoning_content, None

return reasoning_content, final_output
if self.start_token_id is None or self.end_token_id is None:
raise RuntimeError('DeepSeek R1 reasoning parser could not locate '
'think start/end tokens in the tokenizer!')
Loading
Loading