Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions scrapegraphai/nodes/generate_answer_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from langchain_core.prompts import PromptTemplate
from langchain_aws import ChatBedrock
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_openai import ChatOpenAI
from requests.exceptions import Timeout
Expand All @@ -23,7 +22,10 @@
TEMPLATE_NO_CHUNKS,
TEMPLATE_NO_CHUNKS_MD,
)
from ..utils.output_parser import get_pydantic_output_parser
from ..utils.output_parser import (
TolerantJsonOutputParser,
get_pydantic_output_parser,
)
from .base_node import BaseNode


Expand Down Expand Up @@ -148,7 +150,7 @@ def execute(self, state: dict) -> dict:
format_instructions = ""
else:
if not isinstance(self.llm_model, ChatBedrock):
output_parser = JsonOutputParser()
output_parser = TolerantJsonOutputParser()
format_instructions = (
"You must respond with a JSON object. Your response should be formatted as a valid JSON "
"with a 'content' field containing your analysis. For example:\n"
Expand Down
42 changes: 41 additions & 1 deletion scrapegraphai/utils/output_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,53 @@
Functions to retrieve the correct output parser and format instructions for the LLM model.
"""

from typing import Any, Callable, Dict, Type, Union
from typing import Any, Callable, Dict, List, Type, Union

from langchain_core.exceptions import OutputParserException
from langchain_core.outputs import Generation
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel as BaseModelV2
from pydantic.v1 import BaseModel as BaseModelV1


def _strip_doubled_braces(text: str) -> str:
    """Strip one layer of the doubled braces some models echo from the prompt.

    The default ``format_instructions`` show the expected shape using LangChain's
    escaped braces, e.g. ``{{"content": "..."}}``. Strongly instruction-following
    models (GPT-4o, etc.) emit single braces, but some models (notably DeepSeek)
    copy the doubled braces verbatim, producing ``{{"content": "..."}}`` which is
    not valid JSON.

    Surrounding whitespace is ignored when detecting the doubled braces. If the
    whole reply is wrapped in a Markdown code fence (three backticks, optionally
    followed by an info string such as ``json`` on the opening line), the check
    is applied to the fenced payload, because a fenced reply can carry the same
    doubled braces inside the fence.

    Args:
        text: Raw model output.

    Returns:
        The payload with one outer ``{`` and one outer ``}`` removed (and
        without surrounding whitespace or code fence) when it is wrapped in
        doubled braces; otherwise ``text`` itself, unmodified.
    """
    fence = "```"
    candidate = text.strip()

    # Look inside a code fence that spans the whole reply. The length guard
    # keeps the opening and closing fence from overlapping on short inputs.
    if (
        len(candidate) >= 2 * len(fence)
        and candidate.startswith(fence)
        and candidate.endswith(fence)
    ):
        inner = candidate[len(fence):-len(fence)]
        first_line, newline, remainder = inner.partition("\n")
        # Drop the opening fence line only when it is an info string
        # (e.g. "json"); a first line that contains a brace is payload.
        if newline and "{" not in first_line:
            inner = remainder
        candidate = inner.strip()

    if candidate.startswith("{{") and candidate.endswith("}}"):
        return candidate[1:-1]

    # Anything else, including clean fenced JSON, is returned untouched so
    # callers can detect "nothing to normalize" by comparing with the input.
    return text


class TolerantJsonOutputParser(JsonOutputParser):
    """A :class:`JsonOutputParser` that retries once on brace-doubled output.

    Parsing is first delegated to the parent parser, so any output it already
    accepts is returned exactly as before. Only when the parent raises
    :class:`OutputParserException` is the raw text passed through
    ``_strip_doubled_braces``; if that changes the text, the parent parser is
    tried once more on the normalized text. This keeps providers that echo the
    prompt's escaped braces (DeepSeek was the motivating case) working without
    altering output for any model that already returns clean JSON.
    """

    def parse_result(self, result: List[Generation], *, partial: bool = False) -> Any:
        """Parse the first generation, retrying once with doubled braces removed.

        Args:
            result: Candidate generations; the retry is built from
                ``result[0].text``.
            partial: Forwarded unchanged to the parent parser on both attempts.

        Returns:
            The parent parser's result for the original text, or for the
            normalized text when the first attempt raised.

        Raises:
            OutputParserException: The error from the first attempt, when the
                text needs no normalization or the retry fails as well.
        """
        try:
            return super().parse_result(result, partial=partial)
        except OutputParserException:
            raw_text = result[0].text
            normalized = _strip_doubled_braces(raw_text)
            if normalized != raw_text:
                try:
                    return super().parse_result(
                        [Generation(text=normalized)], partial=partial
                    )
                except OutputParserException:
                    # The retry error describes text the model never produced;
                    # fall through and report the first failure instead.
                    pass
            # Bare raise re-raises the first-attempt error with its original
            # traceback and cause intact.
            raise


def get_structured_output_parser(
schema: Union[Dict[str, Any], Type[BaseModelV1 | BaseModelV2], Type],
) -> Callable:
Expand Down
44 changes: 44 additions & 0 deletions tests/utils/output_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Tests for scrapegraphai.utils.output_parser.TolerantJsonOutputParser."""

import pytest

from scrapegraphai.utils.output_parser import (
TolerantJsonOutputParser,
_strip_doubled_braces,
)


def test_strip_doubled_braces_unwraps_single_layer():
    """A payload wrapped in doubled braces loses exactly one brace per side."""
    doubled = '{{"content": "hi"}}'
    unwrapped = _strip_doubled_braces(doubled)
    assert unwrapped == '{"content": "hi"}'


def test_strip_doubled_braces_is_noop_for_clean_json():
    """Single-brace JSON text is handed back untouched."""
    clean = '{"content": "hi"}'
    outcome = _strip_doubled_braces(clean)
    assert outcome == clean


def test_strip_doubled_braces_ignores_unbalanced():
    """Doubled opening braces without a doubled closing pair are left alone."""
    lopsided = '{{"content": "hi"}'
    outcome = _strip_doubled_braces(lopsided)
    assert outcome == lopsided


def test_tolerant_parser_parses_clean_json_unchanged():
    """Valid JSON goes through the tolerant parser and yields the expected dict."""
    parsed = TolerantJsonOutputParser().parse('{"content": "hi"}')
    assert parsed == {"content": "hi"}


def test_tolerant_parser_recovers_doubled_braces():
    """Doubled braces echoed from the prompt (as DeepSeek does) still parse."""
    echoed = '{{"content": "hi"}}'
    parsed = TolerantJsonOutputParser().parse(echoed)
    assert parsed == {"content": "hi"}


def test_tolerant_parser_recovers_doubled_braces_with_whitespace():
    """Whitespace around a doubled-brace payload does not prevent recovery."""
    padded = ' {{"content": "hi"}} '
    parsed = TolerantJsonOutputParser().parse(padded)
    assert parsed == {"content": "hi"}


def test_tolerant_parser_still_raises_on_irrecoverable_output():
    """Text that is not JSON and has no doubled braces must still fail to parse."""
    # Imported locally so the test can name the exact exception type.
    # Asserting on bare ``Exception`` would also pass on unrelated bugs
    # (NameError, TypeError, ...) and hide a real regression.
    from langchain_core.exceptions import OutputParserException

    parser = TolerantJsonOutputParser()
    with pytest.raises(OutputParserException):
        parser.parse("this is not json at all")
Loading