From 07c37a83f196f69a62ea00a9459f2b53b1cce482 Mon Sep 17 00:00:00 2001 From: hafezparast Date: Sun, 29 Mar 2026 00:41:13 +0800 Subject: [PATCH] feat: add "markdown" field type to JsonElementExtractionStrategy (#1708) Add "markdown" as a new field type in the extraction schema type pipeline. When used, the selected element's HTML is converted to markdown via CustomHTML2Text, preserving formatting (bold, links, lists) without returning raw HTML tags. Works across all strategy subclasses (CSS, lxml, XPath) and in pipelines (e.g., ["markdown", "regex"]). Closes #1708 Co-Authored-By: Claude Opus 4.6 (1M context) --- crawl4ai/extraction_strategy.py | 10 + tests/test_markdown_field_type_1708.py | 287 +++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 tests/test_markdown_field_type_1708.py diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index a31560160..ba2a195b6 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -32,6 +32,7 @@ from .models import TokenUsage from .model_loader import * # noqa: F403 +from .html2text import CustomHTML2Text from .model_loader import ( get_device, load_HF_embedding_model, @@ -1273,6 +1274,15 @@ def _extract_single_field(self, element, field): value = self._get_element_attribute(value, field["attribute"]) elif step == "html": value = self._get_element_html(value) + elif step == "markdown": + # Convert element (or HTML string) to markdown + if not isinstance(value, str): + value = self._get_element_html(value) + if isinstance(value, str): + converter = CustomHTML2Text() + value = converter.handle(value).strip() + else: + value = None elif step == "regex": pattern = field.get("pattern") if pattern: diff --git a/tests/test_markdown_field_type_1708.py b/tests/test_markdown_field_type_1708.py new file mode 100644 index 000000000..302ca4b94 --- /dev/null +++ b/tests/test_markdown_field_type_1708.py @@ -0,0 +1,287 @@ +""" +Tests for #1708: "markdown" field type in JsonElementExtractionStrategy + +Verifies that the "markdown" type converts element HTML to markdown, +works in pipelines, across all strategy subclasses, and in end-to-end extraction. +""" +import json +import pytest +from bs4 import BeautifulSoup + + +# ── Rich HTML fixtures ─────────────────────────────────────────────────── + +RICH_HTML = """ +
+

Widget Pro

+

Best seller - Our most popular widget with advanced features.

+ + $29.99 +
+""" + +MULTI_PRODUCT_HTML = """ + +
+

Alpha

+

Bold text and italic.

+
+
+

Beta

+

Plain text with a link.

+
+ +""" + + +# ── JsonCssExtractionStrategy ──────────────────────────────────────────── + +class TestMarkdownTypeCss: + + @pytest.fixture + def strategy(self): + from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + schema = {"name": "test", "baseSelector": "div.product", "fields": []} + return JsonCssExtractionStrategy(schema) + + @pytest.fixture + def element(self): + soup = BeautifulSoup(RICH_HTML, "html.parser") + return soup.find("div", class_="product") + + def test_markdown_type_converts_html_to_markdown(self, strategy, element): + """Basic: 'markdown' type should convert element to markdown string.""" + field = {"selector": "p.desc", "type": "markdown"} + result = strategy._extract_single_field(element, field) + assert isinstance(result, str) + assert "**Best seller**" in result or "Best seller" in result + assert "advanced features" in result + + def test_markdown_preserves_links(self, strategy, element): + """Markdown should preserve link URLs.""" + field = {"selector": "ul.features", "type": "markdown"} + result = strategy._extract_single_field(element, field) + assert isinstance(result, str) + assert "/docs" in result + assert "Well documented" in result + + def test_markdown_preserves_list_structure(self, strategy, element): + """Markdown should convert