Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ async def arun(
url: str,
config: CrawlerRunConfig = None,
**kwargs,
) -> RunManyReturn:
) -> CrawlResultContainer:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
Expand Down
194 changes: 194 additions & 0 deletions tests/test_type_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
"""
Tests for type annotation correctness across the crawl4ai public API.

Catches issues like #1898 where arun() was annotated as returning
RunManyReturn (includes AsyncGenerator) but actually always returns
CrawlResultContainer.

These tests use static analysis (inspect + typing introspection) to verify
return type annotations match actual behavior — without needing pyright/mypy
installed in CI.
"""
import asyncio
import collections.abc
import inspect
import typing
from typing import Union, AsyncGenerator, get_type_hints
import pytest


# ── Return type annotation tests ─────────────────────────────────────────

class TestReturnTypeAnnotations:
"""Verify that public method return types are correctly annotated."""

def _get_return_annotation(self, cls, method_name):
"""Get the return type annotation for a method."""
method = getattr(cls, method_name)
hints = typing.get_type_hints(method)
return hints.get("return")

def _annotation_contains(self, annotation, target_type):
"""Check if a type annotation contains a specific type (including in unions).

Handles both typing.AsyncGenerator and collections.abc.AsyncGenerator,
since typing generics have __origin__ pointing to the abc class.
"""
if annotation is target_type:
return True
origin = getattr(annotation, "__origin__", None)
if origin is target_type:
return True
if origin is Union:
return any(
self._annotation_contains(arg, target_type)
for arg in annotation.__args__
)
return False

def test_arun_does_not_include_async_generator(self):
"""
#1898: arun() always returns CrawlResultContainer, never AsyncGenerator.
The return type should NOT include AsyncGenerator.
"""
from crawl4ai import AsyncWebCrawler
ret = self._get_return_annotation(AsyncWebCrawler, "arun")
assert ret is not None, "arun() has no return type annotation"

# Check both typing.AsyncGenerator and collections.abc.AsyncGenerator
# (typing generics have __origin__ = collections.abc.AsyncGenerator)
has_async_gen = (
self._annotation_contains(ret, collections.abc.AsyncGenerator)
or self._annotation_contains(ret, AsyncGenerator)
)
assert not has_async_gen, (
f"arun() return type includes AsyncGenerator but arun() never yields. "
f"Current annotation: {ret}. "
f"Should be CrawlResultContainer or CrawlResult."
)

def test_arun_many_return_type_exists(self):
"""arun_many() should have a return type annotation."""
from crawl4ai import AsyncWebCrawler
ret = self._get_return_annotation(AsyncWebCrawler, "arun_many")
assert ret is not None, "arun_many() has no return type annotation"

def test_aprocess_html_return_type_exists(self):
"""aprocess_html() should have a return type annotation."""
from crawl4ai import AsyncWebCrawler
ret = self._get_return_annotation(AsyncWebCrawler, "aprocess_html")
assert ret is not None, "aprocess_html() has no return type annotation"


# ── Parameter annotation tests ───────────────────────────────────────────

class TestParameterAnnotations:
"""Verify that public method parameters have type annotations."""

def _get_untyped_params(self, cls, method_name, ignore=("self", "kwargs")):
"""Find parameters without type annotations."""
method = getattr(cls, method_name)
sig = inspect.signature(method)
untyped = []
for name, param in sig.parameters.items():
if name in ignore:
continue
if param.annotation is inspect.Parameter.empty:
untyped.append(name)
return untyped

def test_arun_params_annotated(self):
"""arun() public params should have type annotations."""
from crawl4ai import AsyncWebCrawler
untyped = self._get_untyped_params(AsyncWebCrawler, "arun")
# Allow **kwargs to be untyped, but core params should be typed
assert "url" not in untyped, "arun(url=...) missing type annotation"
assert "config" not in untyped, "arun(config=...) missing type annotation"

def test_arun_many_params_annotated(self):
"""arun_many() public params should have type annotations."""
from crawl4ai import AsyncWebCrawler
untyped = self._get_untyped_params(AsyncWebCrawler, "arun_many")
assert "urls" not in untyped, "arun_many(urls=...) missing type annotation"


# ── Consistency tests ────────────────────────────────────────────────────

class TestAnnotationConsistency:
"""Verify that annotations match actual runtime behavior."""

def test_arun_actually_returns_what_annotation_says(self):
"""
Whatever arun()'s return annotation says, verify that
CrawlResultContainer is compatible with it.
"""
from crawl4ai import AsyncWebCrawler
from crawl4ai.models import CrawlResultContainer

hints = typing.get_type_hints(AsyncWebCrawler.arun)
ret = hints.get("return")
if ret is None:
pytest.skip("No return annotation to check")

# Get all types in the annotation
origin = getattr(ret, "__origin__", None)
if origin is Union:
allowed_types = [
getattr(arg, "__origin__", arg) for arg in ret.__args__
]
else:
allowed_types = [getattr(ret, "__origin__", ret)]

# CrawlResultContainer should be in the allowed types
is_compatible = any(
t is CrawlResultContainer
or (isinstance(t, type) and issubclass(CrawlResultContainer, t))
for t in allowed_types
)
assert is_compatible, (
f"arun() returns CrawlResultContainer at runtime, but annotation {ret} "
f"doesn't include it. Allowed types: {allowed_types}"
)

def test_config_classes_init_params_match_attributes(self):
"""
CrawlerRunConfig.__init__ params should become attributes.
Catches cases where a param is added to __init__ but not stored.
"""
from crawl4ai import CrawlerRunConfig

sig = inspect.signature(CrawlerRunConfig.__init__)
config = CrawlerRunConfig()

missing = []
for name in sig.parameters:
if name == "self":
continue
if not hasattr(config, name):
missing.append(name)

assert not missing, (
f"CrawlerRunConfig.__init__ has params that aren't stored as attributes: {missing}"
)


# ── Public API export tests ──────────────────────────────────────────────

class TestPublicAPIExports:
"""Verify that types referenced in public annotations are importable."""

def test_crawl_result_importable(self):
from crawl4ai import CrawlResult
assert CrawlResult is not None

def test_crawl_result_container_importable(self):
from crawl4ai.models import CrawlResultContainer
assert CrawlResultContainer is not None

def test_run_many_return_importable(self):
from crawl4ai.models import RunManyReturn
assert RunManyReturn is not None

def test_markdown_generation_result_importable(self):
from crawl4ai import MarkdownGenerationResult
assert MarkdownGenerationResult is not None