diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 3c5ca7ed..6f4d2a1f 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -9,7 +9,6 @@
 from langchain_core.prompts import PromptTemplate
 from langchain_aws import ChatBedrock
 from langchain_ollama import ChatOllama
-from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_openai import ChatOpenAI
 from requests.exceptions import Timeout
@@ -23,7 +22,10 @@
     TEMPLATE_NO_CHUNKS,
     TEMPLATE_NO_CHUNKS_MD,
 )
-from ..utils.output_parser import get_pydantic_output_parser
+from ..utils.output_parser import (
+    TolerantJsonOutputParser,
+    get_pydantic_output_parser,
+)
 from .base_node import BaseNode
 
 
@@ -148,7 +150,7 @@ def execute(self, state: dict) -> dict:
                     format_instructions = ""
         else:
             if not isinstance(self.llm_model, ChatBedrock):
-                output_parser = JsonOutputParser()
+                output_parser = TolerantJsonOutputParser()
                 format_instructions = (
                     "You must respond with a JSON object. Your response should be formatted as a valid JSON "
                     "with a 'content' field containing your analysis. For example:\n"
diff --git a/scrapegraphai/utils/output_parser.py b/scrapegraphai/utils/output_parser.py
index a9d9ba31..db216e37 100644
--- a/scrapegraphai/utils/output_parser.py
+++ b/scrapegraphai/utils/output_parser.py
@@ -2,13 +2,55 @@
 Functions to retrieve the correct output parser and format instructions for the LLM model.
 """
 
-from typing import Any, Callable, Dict, Type, Union
+from typing import Any, Callable, Dict, List, Type, Union
 
+from langchain_core.exceptions import OutputParserException
+from langchain_core.outputs import Generation
 from langchain_core.output_parsers import JsonOutputParser
 from pydantic import BaseModel as BaseModelV2
 from pydantic.v1 import BaseModel as BaseModelV1
 
 
+def _strip_doubled_braces(text: str) -> str:
+    """Strip one layer of the doubled braces some models echo from the prompt.
+
+    The default ``format_instructions`` show the expected shape using LangChain's
+    escaped braces, e.g. ``{{"content": "..."}}``. Strongly instruction-following
+    models (GPT-4o, etc.) emit single braces, but some models (notably DeepSeek)
+    copy the doubled braces verbatim, producing ``{{"content": "..."}}`` which is
+    not valid JSON. This normalizes that single case and is a no-op otherwise.
+    """
+    stripped = text.strip()
+    if stripped.startswith("{{") and stripped.endswith("}}"):
+        return stripped[1:-1]
+    return text
+
+
+class TolerantJsonOutputParser(JsonOutputParser):
+    """A :class:`JsonOutputParser` tolerant of doubled-brace output.
+
+    Behaviour is unchanged on the happy path: valid JSON is parsed by the parent
+    parser exactly as before. Only when parsing fails AND the output is wrapped in
+    doubled braces (``{{ ... }}``) does it retry once with a single layer of braces
+    removed. This keeps providers like DeepSeek working without altering output for
+    any model that already returns clean JSON.
+    """
+
+    def parse_result(self, result: List[Generation], *, partial: bool = False) -> Any:
+        try:
+            return super().parse_result(result, partial=partial)
+        except OutputParserException:
+            # Retry only when stripping actually changed the text; otherwise the
+            # failure is unrelated to doubled braces and the original error propagates.
+            text = result[0].text
+            normalized = _strip_doubled_braces(text)
+            if normalized != text:
+                return super().parse_result(
+                    [Generation(text=normalized)], partial=partial
+                )
+            raise
+
+
 def get_structured_output_parser(
     schema: Union[Dict[str, Any], Type[BaseModelV1 | BaseModelV2], Type],
 ) -> Callable:
diff --git a/tests/utils/output_parser_test.py b/tests/utils/output_parser_test.py
new file mode 100644
index 00000000..0c8186f0
--- /dev/null
+++ b/tests/utils/output_parser_test.py
@@ -0,0 +1,45 @@
+"""Tests for scrapegraphai.utils.output_parser.TolerantJsonOutputParser."""
+
+import pytest
+from langchain_core.exceptions import OutputParserException
+
+from scrapegraphai.utils.output_parser import (
+    TolerantJsonOutputParser,
+    _strip_doubled_braces,
+)
+
+
+def test_strip_doubled_braces_unwraps_single_layer():
+    assert _strip_doubled_braces('{{"content": "hi"}}') == '{"content": "hi"}'
+
+
+def test_strip_doubled_braces_is_noop_for_clean_json():
+    text = '{"content": "hi"}'
+    assert _strip_doubled_braces(text) == text
+
+
+def test_strip_doubled_braces_ignores_unbalanced():
+    text = '{{"content": "hi"}'
+    assert _strip_doubled_braces(text) == text
+
+
+def test_tolerant_parser_parses_clean_json_unchanged():
+    parser = TolerantJsonOutputParser()
+    assert parser.parse('{"content": "hi"}') == {"content": "hi"}
+
+
+def test_tolerant_parser_recovers_doubled_braces():
+    """Models such as DeepSeek echo the prompt's escaped braces verbatim."""
+    parser = TolerantJsonOutputParser()
+    assert parser.parse('{{"content": "hi"}}') == {"content": "hi"}
+
+
+def test_tolerant_parser_recovers_doubled_braces_with_whitespace():
+    parser = TolerantJsonOutputParser()
+    assert parser.parse(' {{"content": "hi"}} ') == {"content": "hi"}
+
+
+def test_tolerant_parser_still_raises_on_irrecoverable_output():
+    parser = TolerantJsonOutputParser()
+    with pytest.raises(OutputParserException):
+        parser.parse("this is not json at all")