diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b056a24..1ace900 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,18 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + include: + - python-version: "3.9" + tox: min + - python-version: "3.9" + - python-version: "3.10" + tox: min-x402 + - python-version: "3.10" + - python-version: "3.11" + - python-version: "3.12" + - python-version: "3.13" + - python-version: "3.13" + tox: x402 steps: - uses: actions/checkout@v4 @@ -30,7 +41,7 @@ jobs: python -m pip install tox - name: tox run: | - tox -e py + tox -e ${{ matrix.tox || 'py' }} - name: coverage if: ${{ success() }} uses: codecov/codecov-action@v4.0.1 diff --git a/CHANGES.rst b/CHANGES.rst index 65df607..57f3cd2 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,11 @@ Changes ======= +0.8.0 (unreleased) +------------------ + +* Added :ref:`x402 support `. + 0.7.1 (2025-06-05) ------------------ diff --git a/README.rst b/README.rst index e52e951..e93ded4 100644 --- a/README.rst +++ b/README.rst @@ -35,14 +35,22 @@ Installation pip install zyte-api -.. note:: Python 3.9+ is required. +Or, to use x402_: + +.. _x402: https://www.x402.org/ + +.. code-block:: shell + + pip install zyte-api[x402] + +.. note:: Python 3.9+ is required; Python 3.10+ if using x402. .. install-end Basic usage =========== -.. basic-start +.. basic-key-start Set your API key ---------------- @@ -54,6 +62,9 @@ After you `sign up for a Zyte API account `_. .. key-get-end +.. basic-key-end + +.. basic-start Use the command-line client diff --git a/docs/conf.py b/docs/conf.py index 0c02232..d0223a4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -183,6 +183,11 @@ # A list of files that should not be packed into the epub file. epub_exclude_files = ["search.html"] +# -- Smart quotes ------------------------------------------------------------ +# Disabled so that, in the CLI reference, ``--api-key` and similar options are +# not turned into –api-key in descriptions. +smartquotes = False + # -- Extension configuration ------------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index 7fe3951..6d43a94 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ python-zyte-api :maxdepth: 1 use/key + use/x402 use/cli use/api diff --git a/docs/intro/basic.rst b/docs/intro/basic.rst index 32ff015..5caadbf 100644 --- a/docs/intro/basic.rst +++ b/docs/intro/basic.rst @@ -4,6 +4,14 @@ Basic usage =========== +.. include:: /../README.rst + :start-after: basic-key-start + :end-before: basic-key-end + +To use x402_ instead, see :ref:`x402`. + +.. _x402: https://www.x402.org/ + .. include:: /../README.rst :start-after: basic-start :end-before: basic-end diff --git a/docs/use/api.rst b/docs/use/api.rst index 0d27233..d66ab0f 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -6,8 +6,9 @@ Python client library ===================== -Once you have :ref:`installed python-zyte-api ` and :ref:`configured -your API key `, you can use one of its APIs from Python code: +Once you have :ref:`installed python-zyte-api ` and configured your +:ref:`API key ` or :ref:`Ethereum private key `, you can use one +of its APIs from Python code: - The :ref:`sync API ` can be used to build simple, proof-of-concept or debugging Python scripts. diff --git a/docs/use/cli.rst b/docs/use/cli.rst index abf2479..f650ec1 100644 --- a/docs/use/cli.rst +++ b/docs/use/cli.rst @@ -4,8 +4,9 @@ Command-line client =================== -Once you have :ref:`installed python-zyte-api ` and :ref:`configured -your API key `, you can use the ``zyte-api`` command-line client. +Once you have :ref:`installed python-zyte-api ` and configured your +:ref:`API key ` or :ref:`Ethereum private key `, you can use the +``zyte-api`` command-line client. To use ``zyte-api``, pass an :ref:`input file ` as the first parameter and specify an :ref:`output file ` with ``--output``. diff --git a/docs/use/x402.rst b/docs/use/x402.rst new file mode 100644 index 0000000..e87952c --- /dev/null +++ b/docs/use/x402.rst @@ -0,0 +1,66 @@ +.. _x402: + +==== +x402 +==== + +It is possible to use :ref:`Zyte API ` without a Zyte API account by +using the x402_ protocol to handle payments: + +#. Read the `Zyte Terms of Service`_. By using Zyte API, you are accepting + them. + + .. _Zyte Terms of Service: https://www.zyte.com/terms-policies/terms-of-service/ + +#. During :ref:`installation `, make sure to install the ``x402`` extra. + +#. :ref:`Configure ` the *private* key of your Ethereum_ account to + authorize payments. + + .. _Ethereum: https://ethereum.org/ + + +.. _eth-key: + +Configuring your Ethereum private key +===================================== + +It is recommended to configure your Ethereum private key through an environment +variable, so that it can be picked by both the :ref:`command-line client +` and the :ref:`Python client library `: + +- On Windows’ CMD: + + .. code-block:: shell + + > set ZYTE_API_ETH_KEY=YOUR_ETH_PRIVATE_KEY + +- On macOS and Linux: + + .. code-block:: shell + + $ export ZYTE_API_ETH_KEY=YOUR_ETH_PRIVATE_KEY + +Alternatively, you may pass your Ethereum private key to the clients directly: + +- To pass your Ethereum private key directly to the command-line client, use + the ``--eth-key`` switch: + + .. code-block:: shell + + zyte-api --eth-key YOUR_ETH_PRIVATE_KEY … + +- To pass your Ethereum private key directly to the Python client classes, + use the ``eth_key`` parameter when creating a client object: + + .. code-block:: python + + from zyte_api import ZyteAPI + + client = ZyteAPI(eth_key="YOUR_ETH_PRIVATE_KEY") + + .. code-block:: python + + from zyte_api import AsyncZyteAPI + + client = AsyncZyteAPI(eth_key="YOUR_ETH_PRIVATE_KEY") diff --git a/pyproject.toml b/pyproject.toml index 34ebbc9..86666f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -154,6 +154,9 @@ ignore = [ "zyte_api/errors.py" = ["UP007"] "zyte_api/stats.py" = ["UP007"] +[tool.ruff.lint.flake8-pytest-style] +parametrize-values-type = "tuple" + [tool.ruff.lint.flake8-type-checking] runtime-evaluated-decorators = ["attr.s"] diff --git a/setup.py b/setup.py index a80bb81..aa2f258 100755 --- a/setup.py +++ b/setup.py @@ -16,14 +16,20 @@ "console_scripts": ["zyte-api=zyte_api.__main__:_main"], }, install_requires=[ - "aiohttp >= 3.8.0", - "attrs", - "brotli", - "runstats", - "tenacity", - "tqdm", - "w3lib >= 2.1.1", + "aiohttp>=3.8.0", + "attrs>=20.1.0", + "brotli>=0.5.2", + "runstats>=0.0.1", + "tenacity>=8.2.0", + "tqdm>=4.16.0", + "w3lib>=2.1.1", ], + extras_require={ + "x402": [ + "eth-account>=0.13.7", + "x402>=0.1.1", + ] + }, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", diff --git a/tests/mockserver.py b/tests/mockserver.py index 014f85d..cf3fb11 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -4,6 +4,7 @@ import sys import time from base64 import b64encode +from collections import defaultdict from importlib import import_module from subprocess import PIPE, Popen from typing import Any @@ -14,6 +15,11 @@ from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET, Site +SCREENSHOT = ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAACklEQVR4nGMAAQAABQABDQott" + "AAAAABJRU5ErkJggg==" +) + # https://github.com/scrapy/scrapy/blob/02b97f98e74a994ad3e4d74e7ed55207e508a576/tests/mockserver.py#L27C1-L33C19 def getarg(request, name, default=None, type=None): @@ -62,6 +68,28 @@ def _delayedRender(self, request): request.finish() +RESPONSE_402 = { + "x402Version": 1, + "accepts": [ + { + "scheme": "exact", + "network": "base-sepolia", + "maxAmountRequired": "1000", + "resource": "https://api.zyte.com/v1/extract", + "description": "", + "mimeType": "", + "payTo": "0xFACEdD967ea0592bbb9410fA4877Df9AeB628CB7", + "maxTimeoutSeconds": 130, + "asset": "0xFACEbD53842c5426634e7929541eC2318f3dCF7e", + "extra": {"name": "USDC", "version": "2"}, + } + ], + "error": "Use basic auth or x402", +} + +WORKFLOWS: defaultdict[str, dict[str, Any]] = defaultdict(dict) + + class DefaultResource(Resource): request_count = 0 @@ -69,8 +97,6 @@ def getChild(self, path, request): return self def render_POST(self, request): - request_data = json.loads(request.content.read()) - request.responseHeaders.setRawHeaders( b"Content-Type", [b"application/json"], @@ -80,12 +106,17 @@ def render_POST(self, request): [b"abcd1234"], ) + request_data = json.loads(request.content.read()) + url = request_data["url"] domain = urlparse(url).netloc if domain == "e429.example": request.setResponseCode(429) response_data = {"status": 429, "type": "/limits/over-user-limit"} return json.dumps(response_data).encode() + if domain == "e404.example": + request.setResponseCode(404) + return b"" if domain == "e500.example": request.setResponseCode(500) return "" @@ -119,6 +150,70 @@ def render_POST(self, request): request.setResponseCode(500) return b'["foo"]' + auth_header = request.getHeader("Authorization") + payment_header = request.getHeader("X-Payment") + if not auth_header and not payment_header: + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + + echo_data = request_data.get("echoData") + if echo_data: + session_data = WORKFLOWS.setdefault(echo_data, {}) + if echo_data in {"402-payment-retry", "402-payment-retry-2"}: + assert request.getHeader("X-Payment") + # Return 402 on the first request, then 200 on the second + if not session_data: + session_data["payment_attempts"] = 1 + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + elif echo_data == "402-payment-retry-exceeded": + assert request.getHeader("X-Payment") + # Return 402 on the first 2 requests, then 200 on the third + # (the client will give up after 2 attempts, so there will be no + # third in practice) + if not session_data: + session_data["payment_attempts"] = 1 + session_data["payment"] = request.getHeader("X-Payment") + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + if session_data["payment_attempts"] == 1: + session_data["payment_attempts"] = 2 + # Make sure the client refreshed the payment header + assert session_data["payment"] != request.getHeader("X-Payment") + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + elif echo_data == "402-no-payment-retry": + assert not request.getHeader("X-Payment") + # Return 402 on the first request, then 200 on the second + if not session_data: + session_data["payment_attempts"] = 1 + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + elif echo_data == "402-no-payment-retry-exceeded": + assert not request.getHeader("X-Payment") + # Return 402 on the first 2 requests, then 200 on the third + # (the client will give up after 2 attempts, so there will be no + # third in practice) + if not session_data: + session_data["payment_attempts"] = 1 + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + if session_data["payment_attempts"] == 1: + session_data["payment_attempts"] = 2 + request.setResponseCode(402) + return json.dumps(RESPONSE_402).encode() + elif echo_data == "402-long-error": + request.setResponseCode(402) + response_data = { + **RESPONSE_402, + "error": ( + "This is a long error message that exceeds the 32 " + "character limit for the error type prefix. It should " + "not be parsed as an error type." + ), + } + return json.dumps(response_data).encode() + response_data: dict[str, Any] = { "url": url, } @@ -127,9 +222,12 @@ def render_POST(self, request): if "httpResponseBody" in request_data: body = b64encode(html.encode()).decode() response_data["httpResponseBody"] = body - else: + if "browserHtml" in request_data: assert "browserHtml" in request_data response_data["browserHtml"] = html + if "screenshot" in request_data: + assert "screenshot" in request_data + response_data["screenshot"] = SCREENSHOT return json.dumps(response_data).encode() diff --git a/tests/test_apikey.py b/tests/test_apikey.py new file mode 100644 index 0000000..46cbbc9 --- /dev/null +++ b/tests/test_apikey.py @@ -0,0 +1,15 @@ +import pytest + +from zyte_api.apikey import NoApiKey, get_apikey + + +def test_get_apikey(monkeypatch): + assert get_apikey("a") == "a" + with pytest.raises(NoApiKey): + get_apikey() + with pytest.raises(NoApiKey): + get_apikey(None) + monkeypatch.setenv("ZYTE_API_KEY", "b") + assert get_apikey("a") == "a" + assert get_apikey() == "b" + assert get_apikey(None) == "b" diff --git a/tests/test_async.py b/tests/test_async.py index c9f4c59..f54eff9 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -12,14 +12,14 @@ @pytest.mark.parametrize( "client_cls", - [ + ( AsyncZyteAPI, AsyncClient, - ], + ), ) @pytest.mark.parametrize( ("user_agent", "expected"), - [ + ( ( None, USER_AGENT, @@ -28,7 +28,7 @@ f"scrapy-zyte-api/0.11.1 {USER_AGENT}", f"scrapy-zyte-api/0.11.1 {USER_AGENT}", ), - ], + ), ) def test_user_agent(client_cls, user_agent, expected): client = client_cls(api_key="123", api_url="http:\\test", user_agent=user_agent) @@ -37,10 +37,10 @@ def test_user_agent(client_cls, user_agent, expected): @pytest.mark.parametrize( "client_cls", - [ + ( AsyncZyteAPI, AsyncClient, - ], + ), ) def test_api_key(client_cls): client_cls(api_key="a") @@ -50,10 +50,10 @@ def test_api_key(client_cls): @pytest.mark.parametrize( ("client_cls", "get_method"), - [ + ( (AsyncZyteAPI, "get"), (AsyncClient, "request_raw"), - ], + ), ) @pytest.mark.asyncio async def test_get(client_cls, get_method, mockserver): @@ -70,10 +70,10 @@ async def test_get(client_cls, get_method, mockserver): @pytest.mark.parametrize( ("client_cls", "get_method"), - [ + ( (AsyncZyteAPI, "get"), (AsyncClient, "request_raw"), - ], + ), ) @pytest.mark.asyncio async def test_get_request_error(client_cls, get_method, mockserver): @@ -94,10 +94,10 @@ async def test_get_request_error(client_cls, get_method, mockserver): @pytest.mark.parametrize( ("client_cls", "get_method"), - [ + ( (AsyncZyteAPI, "get"), (AsyncClient, "request_raw"), - ], + ), ) @pytest.mark.asyncio async def test_get_request_error_empty_body(client_cls, get_method, mockserver): @@ -113,10 +113,10 @@ async def test_get_request_error_empty_body(client_cls, get_method, mockserver): @pytest.mark.parametrize( ("client_cls", "get_method"), - [ + ( (AsyncZyteAPI, "get"), (AsyncClient, "request_raw"), - ], + ), ) @pytest.mark.asyncio async def test_get_request_error_non_json(client_cls, get_method, mockserver): @@ -132,10 +132,10 @@ async def test_get_request_error_non_json(client_cls, get_method, mockserver): @pytest.mark.parametrize( ("client_cls", "get_method"), - [ + ( (AsyncZyteAPI, "get"), (AsyncClient, "request_raw"), - ], + ), ) @pytest.mark.asyncio async def test_get_request_error_unexpected_json(client_cls, get_method, mockserver): @@ -151,10 +151,10 @@ async def test_get_request_error_unexpected_json(client_cls, get_method, mockser @pytest.mark.parametrize( ("client_cls", "iter_method"), - [ + ( (AsyncZyteAPI, "iter"), (AsyncClient, "request_parallel_as_completed"), - ], + ), ) @pytest.mark.asyncio async def test_iter(client_cls, iter_method, mockserver): @@ -192,10 +192,10 @@ async def test_iter(client_cls, iter_method, mockserver): @pytest.mark.parametrize( ("client_cls", "get_method", "iter_method"), - [ + ( (AsyncZyteAPI, "get", "iter"), (AsyncClient, "request_raw", "request_parallel_as_completed"), - ], + ), ) @pytest.mark.asyncio async def test_semaphore(client_cls, get_method, iter_method, mockserver): diff --git a/tests/test_auth.py b/tests/test_auth.py new file mode 100644 index 0000000..86838ff --- /dev/null +++ b/tests/test_auth.py @@ -0,0 +1,130 @@ +from os import environ +from subprocess import run +from tempfile import NamedTemporaryFile + +import pytest + +from zyte_api import AsyncZyteAPI + +from .test_x402 import HAS_X402 +from .test_x402 import KEY as ETH_KEY + +ETH_KEY_2 = ETH_KEY[-1] + ETH_KEY[:-1] +assert ETH_KEY_2 != ETH_KEY + + +def run_zyte_api(args, env, mockserver): + with NamedTemporaryFile("w") as url_list: + url_list.write("https://a.example\n") + url_list.flush() + return run( + [ + "python", + "-m", + "zyte_api", + "--api-url", + mockserver.urljoin("/"), + url_list.name, + *args, + ], + capture_output=True, + check=False, + env={**environ, **env}, + ) + + +@pytest.mark.parametrize( + ("scenario", "expected"), + ( + ({}, {"stderr": "NoApiKey"}), + ({"args": ["--api-key", "a"]}, {}), + ({"env": {"ZYTE_API_KEY": "a"}}, {}), + ( + {"args": ["--eth-key", ETH_KEY]}, + {} if HAS_X402 else {"stderr": "ModuleNotFoundError"}, + ), + ( + {"env": {"ZYTE_API_ETH_KEY": ETH_KEY}}, + {} if HAS_X402 else {"stderr": "ModuleNotFoundError"}, + ), + ), +) +def test(scenario, expected, mockserver): + result = run_zyte_api( + scenario.get("args", []), + scenario.get("env", {}), + mockserver, + ) + if "stderr" in expected: + assert expected["stderr"].encode() in result.stderr + assert result.returncode == 1 + else: + assert result.returncode == 0 + + +@pytest.mark.parametrize( + ("scenario", "expected"), + ( + ( + { + "kwargs": {"api_key": "a", "eth_key": ETH_KEY}, + "env": { + "ZYTE_API_KEY": "b", + "ZYTE_API_ETH_KEY": ETH_KEY_2, + }, + }, + {"key_type": "zyte", "key": "a"}, + ), + ( + { + "kwargs": {"eth_key": ETH_KEY}, + "env": { + "ZYTE_API_KEY": "b", + "ZYTE_API_ETH_KEY": ETH_KEY_2, + }, + }, + {"key_type": "eth", "key": ETH_KEY}, + ), + ( + { + "env": { + "ZYTE_API_KEY": "b", + "ZYTE_API_ETH_KEY": ETH_KEY_2, + }, + }, + {"key_type": "zyte", "key": "b"}, + ), + ( + { + "env": { + "ZYTE_API_ETH_KEY": ETH_KEY_2, + }, + }, + {"key_type": "eth", "key": ETH_KEY_2}, + ), + ), +) +def test_precedence(scenario, expected, monkeypatch): + for key, value in scenario.get("env", {}).items(): + monkeypatch.setenv(key, value) + if expected["key_type"] == "eth" and not HAS_X402: + with pytest.raises(ImportError): + AsyncZyteAPI(**scenario.get("kwargs", {})) + return + client = AsyncZyteAPI(**scenario.get("kwargs", {})) + assert client.auth.type == expected["key_type"] + assert client.auth.key == expected["key"] + assert ( + client.api_url == "https://api-x402.zyte.com/v1/" + if expected["key_type"] == "eth" + else "https://api.zyte.com/v1/" + ) + if expected["key_type"] == "zyte": + with pytest.warns(DeprecationWarning, match="api_key property is deprecated"): + assert client.api_key == expected["key"] + else: + with pytest.raises( + NotImplementedError, + match="api_key is not available when using an Ethereum private key", + ): + client.api_key # noqa: B018 diff --git a/tests/test_main.py b/tests/test_main.py index cacf11d..a30cbdc 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -11,7 +11,17 @@ from zyte_api.aio.errors import RequestError -class MockRequestError(Exception): +class MockRequestError(RequestError): + def __init__(self, *args, **kwargs): + super().__init__( + *args, + query={}, + response_content=b"", + request_info=None, + history=None, + **kwargs, + ) + @property def parsed(self): return Mock( @@ -52,7 +62,7 @@ async def fake_exception(value=True): @pytest.mark.parametrize( ("queries", "expected_response", "store_errors", "exception"), ( - [ + ( # test if it stores the error(s) also by adding flag ( [ @@ -79,7 +89,7 @@ async def fake_exception(value=True): False, fake_exception, ), - ] + ) ), ) @pytest.mark.asyncio diff --git a/tests/test_retry.py b/tests/test_retry.py index a41e15c..8bf8f7f 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -36,11 +36,11 @@ class OutlierException(RuntimeError): @pytest.mark.parametrize( ("value", "exception"), - [ + ( (UNSET, OutlierException), (True, OutlierException), (False, RequestError), - ], + ), ) @pytest.mark.asyncio async def test_get_handle_retries(value, exception, mockserver): @@ -64,13 +64,13 @@ def broken_stop(_): @pytest.mark.parametrize( ("retry_factory", "status", "waiter"), - [ + ( (RetryFactory, 429, "throttling"), (RetryFactory, 520, "download_error"), (AggressiveRetryFactory, 429, "throttling"), (AggressiveRetryFactory, 500, "undocumented_error"), (AggressiveRetryFactory, 520, "download_error"), - ], + ), ) @pytest.mark.asyncio async def test_retry_wait(retry_factory, status, waiter, mockserver): @@ -93,10 +93,10 @@ class CustomRetryFactory(retry_factory): @pytest.mark.parametrize( "retry_factory", - [ + ( RetryFactory, AggressiveRetryFactory, - ], + ), ) @pytest.mark.asyncio async def test_retry_wait_network_error(retry_factory): @@ -150,7 +150,7 @@ def __call__(self, number, add=0): @pytest.mark.parametrize( ("retrying", "outcomes", "exhausted"), - [ + ( # Shared behaviors of all retry policies *( (retrying, outcomes, exhausted) @@ -395,7 +395,7 @@ def __call__(self, number, add=0): ), ) ), - ], + ), ) @pytest.mark.asyncio @patch("time.monotonic") diff --git a/tests/test_utils.py b/tests/test_utils.py index 76eab08..01e59ed 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -16,7 +16,7 @@ async def test_create_session_custom_connector(): @pytest.mark.parametrize( ("file_name", "first_line", "expected"), - [ + ( ( "", "https://toscrape.com", @@ -62,7 +62,7 @@ async def test_create_session_custom_connector(): '{"url": "https://toscrape.com"}', "jl", ), - ], + ), ) def test_guess_intype(file_name, first_line, expected): assert _guess_intype(file_name, [first_line]) == expected @@ -70,7 +70,7 @@ def test_guess_intype(file_name, first_line, expected): @pytest.mark.parametrize( ("input", "output"), - [ + ( # Unsafe URLs in the url field are modified, while left untouched on # other fields. ( @@ -102,7 +102,7 @@ def test_guess_intype(file_name, first_line, expected): ), # NOTE: We use w3lib.url.safe_url_string for escaping. Tests covering # the URL escaping logic exist upstream. - ], + ), ) def test_process_query(input, output): assert _process_query(input) == output diff --git a/tests/test_x402.py b/tests/test_x402.py new file mode 100644 index 0000000..bc89a72 --- /dev/null +++ b/tests/test_x402.py @@ -0,0 +1,578 @@ +import contextlib +import importlib.util +from os import environ +from unittest import mock + +import pytest + +from zyte_api import AsyncZyteAPI +from zyte_api._errors import RequestError + +from .mockserver import SCREENSHOT + +BODY = "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==" +HAS_X402 = importlib.util.find_spec("x402") is not None +HTML = "Hello

World!

" +KEY = "c85ef7d79691fe79573b1a7064c5232332f53bb1b44a08f1a737f57a68a4706e" + + +def test_eth_key_param(): + if HAS_X402: + client = AsyncZyteAPI(eth_key=KEY) + assert client.auth.key == KEY + assert client.auth.type == "eth" + assert client.api_url == "https://api-x402.zyte.com/v1/" + else: + with pytest.raises(ImportError, match="No module named 'eth_account'"): + AsyncZyteAPI(eth_key=KEY) + + +@mock.patch.dict(environ, {"ZYTE_API_ETH_KEY": KEY}) +def test_eth_key_env_var(): + if HAS_X402: + AsyncZyteAPI() + else: + with pytest.raises(ImportError, match="No module named 'eth_account'"): + AsyncZyteAPI() + + +def test_eth_key_short(): + if HAS_X402: + with pytest.raises(ValueError, match="must be exactly 32 bytes long"): + AsyncZyteAPI(eth_key="a") + else: + with pytest.raises(ImportError, match="No module named 'eth_account'"): + AsyncZyteAPI(eth_key="a") + + +@contextlib.contextmanager +def reset_x402_cache(): + from zyte_api import _x402 + + try: + yield _x402.CACHE + finally: + _x402.CACHE = {} + + +@pytest.mark.skipif(not HAS_X402, reason="x402 not installed") +@pytest.mark.asyncio +@pytest.mark.parametrize( + "scenario", + ( + # Identical + { + "i1": {"url": "https://a.example", "httpResponseBody": True}, + "o1": {"url": "https://a.example", "httpResponseBody": BODY}, + "i2": {"url": "https://a.example", "httpResponseBody": True}, + "o2": {"url": "https://a.example", "httpResponseBody": BODY}, + "cache": "hit", + }, + # Extra headers + { + "i1": {"url": "https://a.example", "httpResponseBody": True}, + "o1": {"url": "https://a.example", "httpResponseBody": BODY}, + "i2": { + "url": "https://a.example", + "httpResponseBody": True, + "customHttpRequestHeaders": [{"name": "foo", "value": "bar"}], + }, + "o2": {"url": "https://a.example", "httpResponseBody": BODY}, + "cache": "hit", + }, + # Different domain + { + "i1": {"url": "https://a.example", "httpResponseBody": True}, + "o1": {"url": "https://a.example", "httpResponseBody": BODY}, + "i2": {"url": "https://b.example", "httpResponseBody": True}, + "o2": {"url": "https://b.example", "httpResponseBody": BODY}, + "cache": "miss", + }, + # Different request type (HTTP vs browser) + { + "i1": {"url": "https://a.example", "httpResponseBody": True}, + "o1": {"url": "https://a.example", "httpResponseBody": BODY}, + "i2": {"url": "https://a.example", "browserHtml": True}, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "miss", + }, + # Screenshot + { + "i1": {"url": "https://a.example", "browserHtml": True}, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": {"url": "https://a.example", "screenshot": True}, + "o2": {"url": "https://a.example", "screenshot": SCREENSHOT}, + "cache": "miss", + }, + # Actions: Empty actions count as no actions + { + "i1": {"url": "https://a.example", "browserHtml": True}, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": {"url": "https://a.example", "browserHtml": True, "actions": []}, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "hit", + }, + # Actions: Actions vs no actions + { + "i1": {"url": "https://a.example", "browserHtml": True}, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": { + "url": "https://a.example", + "browserHtml": True, + "actions": [{"action": "click", "selector": "button#submit"}], + }, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "miss", + }, + # Actions: Different action count does not prevent cache hit + { + "i1": { + "url": "https://a.example", + "browserHtml": True, + "actions": [{"action": "click", "selector": "button#submit"}], + }, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": { + "url": "https://a.example", + "browserHtml": True, + "actions": [ + {"action": "click", "selector": "button#submit"}, + {"action": "scrollBottom"}, + ], + }, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "hit", + }, + # Network capture: Empty network capture count as no network capture + { + "i1": {"url": "https://a.example", "browserHtml": True}, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": { + "url": "https://a.example", + "browserHtml": True, + "networkCapture": [], + }, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "hit", + }, + # Network capture: Network capture vs no network capture + { + "i1": {"url": "https://a.example", "browserHtml": True}, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": { + "url": "https://a.example", + "browserHtml": True, + "networkCapture": [ + { + "filterType": "url", + "httpResponseBody": True, + "value": "/api/", + "matchType": "contains", + } + ], + }, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "miss", + }, + # Network capture: Different network capture count does not prevent + # cache hit + { + "i1": { + "url": "https://a.example", + "browserHtml": True, + "networkCapture": [ + { + "filterType": "url", + "httpResponseBody": True, + "value": "/api/", + "matchType": "contains", + } + ], + }, + "o1": {"url": "https://a.example", "browserHtml": HTML}, + "i2": { + "url": "https://a.example", + "browserHtml": True, + "networkCapture": [ + { + "filterType": "url", + "httpResponseBody": True, + "value": "/api/", + "matchType": "contains", + }, + { + "filterType": "url", + "httpResponseBody": True, + "value": "/other/", + "matchType": "contains", + }, + ], + }, + "o2": {"url": "https://a.example", "browserHtml": HTML}, + "cache": "hit", + }, + # Extraction: serp does not affect cost + { + "i1": {"url": "https://a.example", "httpResponseBody": True}, + "o1": {"url": "https://a.example", "httpResponseBody": BODY}, + "i2": {"url": "https://a.example", "serp": True}, + "o2": {"url": "https://a.example"}, + "cache": "hit", + }, + # Extraction: all other extraction types do affect cost + { + "i1": {"url": "https://a.example", "httpResponseBody": True}, + "o1": {"url": "https://a.example", "httpResponseBody": BODY}, + "i2": {"url": "https://a.example", "product": True}, + "o2": {"url": "https://a.example"}, + "cache": "miss", + }, + # Extraction: customAttributes, same method + { + "i1": { + "url": "https://a.example", + "customAttributes": { + "summary": { + "type": "string", + "description": "A two sentence article summary", + }, + }, + }, + "o1": {"url": "https://a.example"}, + "i2": { + "url": "https://a.example", + "customAttributes": { + "summary": { + "type": "string", + "description": "A two sentence article summary", + }, + }, + "customAttributesOptions": { + "method": "generate", + }, + }, + "o2": {"url": "https://a.example"}, + "cache": "hit", + }, + # Extraction: customAttributes, same method + { + "i1": { + "url": "https://a.example", + "customAttributes": { + "summary": { + "type": "string", + "description": "A two sentence article summary", + }, + }, + }, + "o1": {"url": "https://a.example"}, + "i2": { + "url": "https://a.example", + "customAttributes": { + "summary": { + "type": "string", + "description": "A two sentence article summary", + }, + }, + "customAttributesOptions": { + "method": "extract", + }, + }, + "o2": {"url": "https://a.example"}, + "cache": "miss", + }, + # Extraction: the number of custom attributes does not affect cost + { + "i1": { + "url": "https://a.example", + "customAttributes": { + "summary": { + "type": "string", + "description": "A two sentence article summary", + }, + }, + }, + "o1": {"url": "https://a.example"}, + "i2": { + "url": "https://a.example", + "customAttributes": { + "summary": { + "type": "string", + "description": "A two sentence article summary", + }, + "article_sentiment": { + "type": "string", + "enum": ["positive", "negative", "neutral"], + }, + }, + }, + "o2": {"url": "https://a.example"}, + "cache": "hit", + }, + # Extraction: extractFrom is assumed to be browser by default + { + "i1": { + "url": "https://a.example", + "product": True, + }, + "o1": {"url": "https://a.example"}, + "i2": { + "url": "https://a.example", + "product": True, + "productOptions": { + "extractFrom": "browserHtml", + }, + }, + "o2": {"url": "https://a.example"}, + "cache": "hit", + }, + # Extraction: extractFrom affects cost + { + "i1": { + "url": "https://a.example", + "product": True, + "productOptions": { + "extractFrom": "httpResponseBody", + }, + }, + "o1": {"url": "https://a.example"}, + "i2": { + "url": "https://a.example", + "product": True, + "productOptions": { + "extractFrom": "browserHtml", + }, + }, + "o2": {"url": "https://a.example"}, + "cache": "miss", + }, + ), +) +async def test_cache(scenario, mockserver): + """Requests that are expected to have the same cost (or cost modifiers) as + a preceding request should hit the cache. + + https://docs.zyte.com/zyte-api/pricing.html#request-costs + """ + client = AsyncZyteAPI(eth_key=KEY, api_url=mockserver.urljoin("/")) + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + + # Request 1 + actual_result = await client.get(scenario["i1"]) + assert actual_result == scenario["o1"] + assert len(cache) == 1 + assert client.agg_stats.n_402_req == 1 + + # Request 2 + actual_result = await client.get(scenario["i2"]) + assert actual_result == scenario["o2"] + assert len(cache) == 2 if scenario["cache"] == "miss" else 1 + assert client.agg_stats.n_402_req == len(cache) + + +@pytest.mark.skipif(not HAS_X402, reason="x402 not installed") +@pytest.mark.asyncio +@mock.patch("zyte_api._x402.MINIMIZE_REQUESTS", False) +async def test_no_cache(mockserver): + client = AsyncZyteAPI(eth_key=KEY, api_url=mockserver.urljoin("/")) + input = {"url": "https://a.example", "httpResponseBody": True} + output = { + "url": "https://a.example", + "httpResponseBody": BODY, + } + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + + # Initial request + actual_result = await client.get(input) + assert actual_result == output + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 1 + + # Identical request + actual_result = await client.get(input) + assert actual_result == output + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 2 + + # Request refresh + input = { + "url": "https://a.example", + "httpResponseBody": True, + "echoData": "402-payment-retry-2", + } + actual_result = await client.get(input) + assert actual_result == output + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 3 + + +@pytest.mark.skipif(not HAS_X402, reason="x402 not installed") +@pytest.mark.asyncio +async def test_4xx(mockserver): + """An unexpected status code lower than 500 raises RequestError + immediately.""" + client = AsyncZyteAPI(eth_key=KEY, api_url=mockserver.urljoin("/")) + input = {"url": "https://e404.example", "httpResponseBody": True} + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + with pytest.raises(RequestError): + await client.get(input) + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 1 + + +@pytest.mark.skipif(not HAS_X402, reason="x402 not installed") +@pytest.mark.asyncio +async def test_5xx(mockserver): + """An unexpected status code ≥ 500 gets retried once.""" + client = AsyncZyteAPI(eth_key=KEY, api_url=mockserver.urljoin("/")) + input = {"url": "https://e500.example", "httpResponseBody": True} + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + with pytest.raises(RequestError): + await client.get(input) + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 2 + + +@pytest.mark.skipif(not HAS_X402, reason="x402 not installed") +@pytest.mark.asyncio +async def test_payment_retry(mockserver): + client = AsyncZyteAPI(eth_key=KEY, api_url=mockserver.urljoin("/")) + input = { + "url": "https://a.example", + "httpResponseBody": True, + "echoData": "402-payment-retry", + } + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + data = await client.get(input) + assert len(cache) == 1 + + assert data == {"httpResponseBody": BODY, "url": "https://a.example"} + assert client.agg_stats.n_success == 1 + assert client.agg_stats.n_fatal_errors == 0 + assert client.agg_stats.n_attempts == 2 + assert client.agg_stats.n_errors == 1 + assert client.agg_stats.n_402_req == 1 + assert client.agg_stats.status_codes == {402: 1, 200: 1} + assert client.agg_stats.exception_types == {} + assert client.agg_stats.api_error_types == {"/x402/use-basic-auth-or-x402": 1} + + +@pytest.mark.skipif(not HAS_X402, reason="x402 not installed") +@pytest.mark.asyncio +async def test_payment_retry_exceeded(mockserver): + client = AsyncZyteAPI(eth_key=KEY, api_url=mockserver.urljoin("/")) + input = { + "url": "https://a.example", + "httpResponseBody": True, + "echoData": "402-payment-retry-exceeded", + } + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + with pytest.raises(RequestError): + await client.get(input) + assert len(cache) == 1 + + assert client.agg_stats.n_success == 0 + assert client.agg_stats.n_fatal_errors == 1 + assert client.agg_stats.n_attempts == 2 + assert client.agg_stats.n_errors == 2 + assert client.agg_stats.n_402_req == 1 + assert client.agg_stats.status_codes == {402: 2} + assert client.agg_stats.exception_types == {} + assert client.agg_stats.api_error_types == {"/x402/use-basic-auth-or-x402": 2} + + +@pytest.mark.asyncio +async def test_no_payment_retry(mockserver): + """An HTTP 402 response received out of the context of the x402 protocol, + as a response to a regular request using basic auth.""" + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + input = { + "url": "https://a.example", + "httpResponseBody": True, + "echoData": "402-no-payment-retry", + } + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + data = await client.get(input) + assert len(cache) == 0 + + assert data == {"httpResponseBody": BODY, "url": "https://a.example"} + assert client.agg_stats.n_success == 1 + assert client.agg_stats.n_fatal_errors == 0 + assert client.agg_stats.n_attempts == 2 + assert client.agg_stats.n_errors == 1 + assert client.agg_stats.n_402_req == 0 + assert client.agg_stats.status_codes == {402: 1, 200: 1} + assert client.agg_stats.exception_types == {} + assert client.agg_stats.api_error_types == {"/x402/use-basic-auth-or-x402": 1} + + +@pytest.mark.asyncio +async def test_no_payment_retry_exceeded(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + input = { + "url": "https://a.example", + "httpResponseBody": True, + "echoData": "402-no-payment-retry-exceeded", + } + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + with pytest.raises(RequestError): + await client.get(input) + assert len(cache) == 0 + + assert client.agg_stats.n_success == 0 + assert client.agg_stats.n_fatal_errors == 1 + assert client.agg_stats.n_attempts == 2 + assert client.agg_stats.n_errors == 2 + assert client.agg_stats.n_402_req == 0 + assert client.agg_stats.status_codes == {402: 2} + assert client.agg_stats.exception_types == {} + assert client.agg_stats.api_error_types == {"/x402/use-basic-auth-or-x402": 2} + + +@pytest.mark.asyncio +async def test_long_error(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + input = { + "url": "https://a.example", + "httpResponseBody": True, + "echoData": "402-long-error", + } + + with reset_x402_cache() as cache: + assert len(cache) == 0 + assert client.agg_stats.n_402_req == 0 + with pytest.raises(RequestError): + await client.get(input) + assert len(cache) == 0 + + assert client.agg_stats.n_success == 0 + assert client.agg_stats.n_fatal_errors == 1 + assert client.agg_stats.n_attempts == 2 + assert client.agg_stats.n_errors == 2 + assert client.agg_stats.n_402_req == 0 + assert client.agg_stats.status_codes == {402: 2} + assert client.agg_stats.exception_types == {} + assert client.agg_stats.api_error_types == {None: 2} diff --git a/tox.ini b/tox.ini index 33663b7..dce2440 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py39,py310,py311,py312,py313,mypy,docs,twine +envlist = pre-commit,mypy,min,min-x402,py39,py310,py311,py312,py313,x402,docs,twine [testenv] deps = @@ -10,13 +10,37 @@ deps = pytest-twisted responses twisted - commands = py.test \ --cov-report=term-missing --cov-report=html --cov-report=xml --cov=zyte_api \ --doctest-modules \ {posargs:zyte_api tests} +[testenv:x402] +extras = x402 + +[min] +deps = + {[testenv]deps} + aiohttp==3.8.0 + attrs==20.1.0 + brotli==0.5.2 + runstats==0.0.1 + tenacity==8.2.0 + tqdm==4.16.0 + w3lib==2.1.1 + +[testenv:min] +basepython = python3.9 +deps = {[min]deps} + +[testenv:min-x402] +basepython = python3.10 +deps = + {[min]deps} + eth_account==0.13.7 + x402==0.1.1 + [testenv:mypy] deps = mypy==1.12.0 @@ -42,8 +66,8 @@ commands = pre-commit run --all-files --show-diff-on-failure [testenv:twine] deps = - twine==5.1.1 - build==1.2.2 + twine==6.1.0 + build==1.2.2.post1 commands = python -m build --sdist twine check dist/* diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index eda97e6..fab06e3 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -2,7 +2,7 @@ Python client libraries and command line utilities for Zyte API """ -from ._async import AsyncZyteAPI +from ._async import AsyncZyteAPI, AuthInfo from ._errors import RequestError from ._retry import ( AggressiveRetryFactory, diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index e3bd8cc..060115a 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -1,5 +1,7 @@ """Basic command-line interface for Zyte API.""" +from __future__ import annotations + import argparse import asyncio import json @@ -12,6 +14,7 @@ from tenacity import retry_if_exception from zyte_api._async import AsyncZyteAPI +from zyte_api._errors import RequestError from zyte_api._retry import RetryFactory, _is_throttling_error from zyte_api._utils import create_session from zyte_api.constants import API_URL @@ -33,10 +36,11 @@ async def run( *, n_conn, stop_on_errors=_UNSET, - api_url, + api_url: str | None, api_key=None, retry_errors=True, store_errors=None, + eth_key=None, ): if stop_on_errors is not _UNSET: warn( @@ -54,8 +58,13 @@ def write_output(content): pbar.update() retrying = None if retry_errors else DontRetryErrorsFactory().build() + auth_kwargs = {} + if api_key: + auth_kwargs["api_key"] = api_key + elif eth_key: + auth_kwargs["eth_key"] = eth_key client = AsyncZyteAPI( - n_conn=n_conn, api_key=api_key, api_url=api_url, retrying=retrying + n_conn=n_conn, api_url=api_url, retrying=retrying, **auth_kwargs ) async with create_session(connection_pool_size=n_conn) as session: result_iter = client.iter( @@ -71,7 +80,7 @@ def write_output(content): try: result = await fut except Exception as e: - if store_errors: + if store_errors and isinstance(e, RequestError): write_output(e.parsed.response_body.decode()) if stop_on_errors: @@ -154,12 +163,38 @@ def _get_argument_parser(program_name="zyte-api"): default=20, help=("Number of concurrent connections to use (default: %(default)s)."), ) - p.add_argument( + group = p.add_mutually_exclusive_group(required=False) + group.add_argument( "--api-key", - help="Zyte API key.", + help=( + "Zyte API key.\n" + "\n" + "If not specified, it is read from the ZYTE_API_KEY environment " + "variable." + "\n" + "Cannot be combined with --eth-key." + ), + ) + group.add_argument( + "--eth-key", + help=( + "Ethereum private key, as a hexadecimal string.\n" + "\n" + "If not specified, it is read from the ZYTE_API_ETH_KEY " + "environment variable." + "\n" + "Cannot be combined with --api-key." + ), ) p.add_argument( - "--api-url", help="Zyte API endpoint (default: %(default)s).", default=API_URL + "--api-url", + help=( + f"Zyte API endpoint (default: {API_URL}).\n" + f"\n" + f"Using an Ethereum private key, e.g. through --eth-key or " + f"through the ZYTE_API_ETH_KEY environment variable, changes the " + f"default API URL to https://api-x402.zyte.com/v1/.\n" + ), ) p.add_argument( "--loglevel", @@ -218,6 +253,7 @@ def _main(program_name="zyte-api"): n_conn=args.n_conn, api_url=args.api_url, api_key=args.api_key, + eth_key=args.eth_key, retry_errors=not args.dont_retry_errors, store_errors=args.store_errors, ) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index a5890ed..b19df17 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -4,16 +4,21 @@ import time from asyncio import Future from functools import partial +from os import environ from typing import TYPE_CHECKING, Any +from warnings import warn import aiohttp from tenacity import AsyncRetrying +from zyte_api._x402 import _x402Handler +from zyte_api.apikey import NoApiKey + from ._errors import RequestError from ._retry import zyte_api_retrying from ._utils import _AIO_API_TIMEOUT, create_session -from .apikey import get_apikey from .constants import API_URL +from .constants import ENV_VARIABLE as API_KEY_ENV_VAR from .stats import AggStats, ResponseStats from .utils import USER_AGENT, _process_query @@ -77,6 +82,25 @@ def iter( ) +class AuthInfo: + def __init__(self, *, _auth): + self._auth = _auth + + @property + def key(self) -> str: + if isinstance(self._auth, str): + return self._auth + assert isinstance(self._auth, _x402Handler) + return self._auth.client.account.key.hex() + + @property + def type(self) -> str: + if isinstance(self._auth, str): + return "zyte" + assert isinstance(self._auth, _x402Handler) + return "eth" + + class AsyncZyteAPI: """:ref:`Asynchronous Zyte API client `. @@ -86,24 +110,65 @@ class AsyncZyteAPI: def __init__( self, *, - api_key=None, - api_url=API_URL, - n_conn=15, + api_key: str | None = None, + api_url: str | None = None, + n_conn: int = 15, retrying: AsyncRetrying | None = None, user_agent: str | None = None, + eth_key: str | None = None, ): if retrying is not None and not isinstance(retrying, AsyncRetrying): raise ValueError( "The retrying parameter, if defined, must be an instance of " "AsyncRetrying." ) - self.api_key = get_apikey(api_key) - self.api_url = api_url + self.n_conn = n_conn self.agg_stats = AggStats() self.retrying = retrying or zyte_api_retrying self.user_agent = user_agent or USER_AGENT self._semaphore = asyncio.Semaphore(n_conn) + self._auth: str | _x402Handler + self.auth: AuthInfo + self.api_url: str + self._load_auth(api_key, eth_key, api_url) + + def _load_auth(self, api_key: str | None, eth_key: str | None, api_url: str | None): + if api_key: + self._auth = api_key + elif eth_key: + self._auth = _x402Handler(eth_key, self._semaphore, self.agg_stats) + elif api_key := environ.get(API_KEY_ENV_VAR): + self._auth = api_key + elif eth_key := environ.get("ZYTE_API_ETH_KEY"): + self._auth = _x402Handler(eth_key, self._semaphore, self.agg_stats) + else: + raise NoApiKey( + "You must provide either a Zyte API key or an Ethereum " + "private key. For the latter, you must also install " + "zyte-api as zyte-api[x402]." + ) + self.auth = AuthInfo(_auth=self._auth) + self.api_url = ( + api_url + if api_url is not None + else "https://api-x402.zyte.com/v1/" + if self.auth.type == "eth" + else API_URL + ) + + @property + def api_key(self) -> str: + if isinstance(self._auth, str): + warn( + "The api_key property is deprecated, use auth.key instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._auth + raise NotImplementedError( + "api_key is not available when using an Ethereum private key, use auth.key instead." + ) async def get( self, @@ -117,9 +182,25 @@ async def get( """Asynchronous equivalent to :meth:`ZyteAPI.get`.""" retrying = retrying or self.retrying post = _post_func(session) - auth = aiohttp.BasicAuth(self.api_key) + + url = self.api_url + endpoint + query = _process_query(query) headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} + auth_kwargs = {} + if isinstance(self._auth, str): + auth_kwargs["auth"] = aiohttp.BasicAuth(self._auth) + else: + x402_headers = await self._auth.get_headers(url, query, headers, post) + headers.update(x402_headers) + + post_kwargs = { + "url": url, + "json": query, + "headers": headers, + **auth_kwargs, + } + response_stats = [] start_global = time.perf_counter() @@ -127,17 +208,15 @@ async def request(): stats = ResponseStats.create(start_global) self.agg_stats.n_attempts += 1 - safe_query = _process_query(query) - post_kwargs = { - "url": self.api_url + endpoint, - "json": safe_query, - "auth": auth, - "headers": headers, - } - try: async with self._semaphore, post(**post_kwargs) as resp: stats.record_connected(resp.status, self.agg_stats) + if ( + resp.status == 402 + and isinstance(self._auth, _x402Handler) + and "X-Payment" in post_kwargs["headers"] + ): + self._auth.refresh_post_kwargs(post_kwargs, await resp.json()) if resp.status >= 400: content = await resp.read() resp.release() @@ -151,7 +230,7 @@ async def request(): message=resp.reason, headers=resp.headers, response_content=content, - query=safe_query, + query=query, ) response = await resp.json() diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index a6a82e1..430d380 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -19,6 +19,7 @@ retry_if_exception, wait_chain, wait_fixed, + wait_none, wait_random, wait_random_exponential, ) @@ -159,6 +160,10 @@ def _undocumented_error(exc: BaseException) -> bool: ) +def _402_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status == 402 + + def _deprecated(message: str, callable: Callable) -> Callable: def wrapper(factory, retry_state: RetryCallState): warn(message, DeprecationWarning, stacklevel=3) @@ -200,6 +205,7 @@ class CustomRetryFactory(RetryFactory): | retry_if_exception(_is_network_error) | retry_if_exception(_download_error) | retry_if_exception(_undocumented_error) + | retry_if_exception(_402_error) ) # throttling throttling_wait = wait_chain( @@ -244,6 +250,9 @@ class CustomRetryFactory(RetryFactory): undocumented_error_stop = stop_on_count(2) undocumented_error_wait = network_error_wait + x402_error_stop = stop_on_count(2) + x402_error_wait = wait_none() + def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() @@ -254,6 +263,8 @@ def wait(self, retry_state: RetryCallState) -> float: return self.network_error_wait(retry_state=retry_state) if _undocumented_error(exc): return self.undocumented_error_wait(retry_state=retry_state) + if _402_error(exc): + return self.x402_error_wait(retry_state=retry_state) assert _download_error(exc) # See retry_condition return self.download_error_wait(retry_state=retry_state) @@ -267,6 +278,8 @@ def stop(self, retry_state: RetryCallState) -> bool: return self.network_error_stop(retry_state) if _undocumented_error(exc): return self.undocumented_error_stop(retry_state) + if _402_error(exc): + return self.x402_error_stop(retry_state) assert _download_error(exc) # See retry_condition return self.download_error_stop(retry_state) diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py index 8f3fdb1..5a318e4 100644 --- a/zyte_api/_sync.py +++ b/zyte_api/_sync.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING from ._async import AsyncZyteAPI -from .constants import API_URL if TYPE_CHECKING: from collections.abc import Generator @@ -83,7 +82,14 @@ class ZyteAPI: *api_key* is your Zyte API key. If not specified, it is read from the ``ZYTE_API_KEY`` environment variable. See :ref:`api-key`. - *api_url* is the Zyte API base URL. + Alternatively, you can set an Ethereum private key through *eth_key* to use + Ethereum for payments. If not specified, it is read from the + ``ZYTE_API_ETH_KEY`` environment variable. See :ref:`x402`. + + *api_url* is the Zyte API base URL. If set to ``None``, it defaults to + ``"https://api.zyte.com/v1/"``. If using an Ethereum private key, e.g. + through *eth_key* or through the ``ZYTE_API_ETH_KEY`` environment + variable, ``None`` results in ``"https://api-x402.zyte.com/v1/"`` instead. *n_conn* is the maximum number of concurrent requests to use. See :ref:`api-optimize`. @@ -101,11 +107,12 @@ class ZyteAPI: def __init__( self, *, - api_key=None, - api_url=API_URL, - n_conn=15, + api_key: str | None = None, + api_url: str | None = None, + n_conn: int = 15, retrying: AsyncRetrying | None = None, user_agent: str | None = None, + eth_key: str | None = None, ): self._async_client = AsyncZyteAPI( api_key=api_key, @@ -113,6 +120,7 @@ def __init__( n_conn=n_conn, retrying=retrying, user_agent=user_agent, + eth_key=eth_key, ) def get( diff --git a/zyte_api/_x402.py b/zyte_api/_x402.py new file mode 100644 index 0000000..b7d506c --- /dev/null +++ b/zyte_api/_x402.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +import json +from hashlib import md5 +from os import environ +from typing import TYPE_CHECKING, Any +from urllib.parse import urlparse + +from tenacity import stop_after_attempt + +from zyte_api._errors import RequestError +from zyte_api._retry import RetryFactory + +if TYPE_CHECKING: + from asyncio import Semaphore + from collections.abc import Callable + from contextlib import AbstractAsyncContextManager + + from aiohttp import ClientResponse + + from zyte_api.stats import AggStats + +CACHE: dict[bytes, tuple[Any, str]] = {} +EXTRACT_KEYS = { + "article", + "articleList", + "articleNavigation", + "forumThread", + "jobPosting", + "jobPostingNavigation", + "product", + "productList", + "productNavigation", + "serp", +} +MINIMIZE_REQUESTS = environ.get("ZYTE_API_ETH_MINIMIZE_REQUESTS") != "false" + + +def get_extract_from(query: dict[str, Any], data_type: str) -> str | Any: + options = query.get(f"{data_type}Options", {}) + default_extract_from = "httpResponseBody" if data_type == "serp" else None + return options.get("extractFrom", default_extract_from) + + +def get_extract_froms(query: dict[str, Any]) -> set[str]: + result = set() + for key in EXTRACT_KEYS: + if not query.get(key, False): + continue + result.add(get_extract_from(query, key)) + return result + + +def may_use_browser(query: dict[str, Any]) -> bool: + """Return ``False`` if *query* indicates with certainty that browser + rendering will not be used, or ``True`` otherwise.""" + for key in ("browserHtml", "screenshot"): + if query.get(key): + return True + extract_froms = get_extract_froms(query) + if "browserHtml" in extract_froms: + return True + if "httpResponseBody" in extract_froms: + return False + return not query.get("httpResponseBody") + + +def get_max_cost_hash(query: dict[str, Any]) -> bytes: + """Returns a hash based on *query* that should be the same for queries + whose estimate costs are the same. + + For open-ended costs, like actions, network capture or custom attributes, + we assume that Zyte API will not report a different cost based on e.g. the + number of actions or their parameters, or similar details of network + capture or custom attributes. + + See also: https://docs.zyte.com/zyte-api/pricing.html#request-costs + """ + data = { + "domain": urlparse(query["url"]).netloc, + "type": "browser" if may_use_browser(query) else "http", + } + for key in ( + *(k for k in EXTRACT_KEYS if k != "serp"), # serp does not affect cost + "actions", + "networkCapture", + "screenshot", + ): + if query.get(key): + data[key] = True + if query.get("customAttributes"): + data["customAttributesOptions.method"] = query.get( + "customAttributesOptions", {} + ).get("method", "generate") + return md5(json.dumps(data, sort_keys=True).encode()).digest() # noqa: S324 + + +class X402RetryFactory(RetryFactory): + # Disable ban response retries. + download_error_stop = stop_after_attempt(1) # type: ignore[assignment] + + +X402_RETRYING = X402RetryFactory().build() + + +class _x402Handler: + def __init__( + self, + eth_key: str, + semaphore: Semaphore, + stats: AggStats, + ): + from eth_account import Account + from x402.clients import x402Client + from x402.types import x402PaymentRequiredResponse + + account = Account.from_key(eth_key) + self.client = x402Client(account=account) + self.semaphore = semaphore + self.stats = stats + self.x402PaymentRequiredResponse = x402PaymentRequiredResponse + + async def get_headers( + self, + url: str, + query: dict[str, Any], + headers: dict[str, str], + post_fn: Callable[..., AbstractAsyncContextManager[ClientResponse]], + ) -> dict[str, str]: + requirement_data = await self.get_requirement_data(url, query, headers, post_fn) + return self.get_headers_from_requirement_data(requirement_data) + + def get_headers_from_requirement_data( + self, requirement_data: tuple[Any, str] + ) -> dict[str, str]: + payment_header = self.client.create_payment_header(*requirement_data) + return { + "Access-Control-Expose-Headers": "X-Payment-Response", + "X-Payment": payment_header, + } + + async def get_requirement_data( + self, + url: str, + query: dict[str, Any], + headers: dict[str, str], + post_fn: Callable[..., AbstractAsyncContextManager[ClientResponse]], + ) -> tuple[Any, str]: + if not MINIMIZE_REQUESTS: + return await self.fetch_requirements(url, query, headers, post_fn) + max_cost_hash = get_max_cost_hash(query) + if max_cost_hash not in CACHE: + CACHE[max_cost_hash] = await self.fetch_requirements( + url, query, headers, post_fn + ) + return CACHE[max_cost_hash] + + async def fetch_requirements( + self, + url: str, + query: dict[str, Any], + headers: dict[str, str], + post_fn: Callable[..., AbstractAsyncContextManager[ClientResponse]], + ) -> tuple[Any, str]: + post_kwargs = {"url": url, "json": query, "headers": headers} + + async def request(): + self.stats.n_402_req += 1 + async with self.semaphore, post_fn(**post_kwargs) as response: + if response.status == 402: + return await response.json() + content = await response.read() + response.release() + raise RequestError( + request_info=response.request_info, + history=response.history, + status=response.status, + message=response.reason, + headers=response.headers, + response_content=content, + query=query, + ) + + request = X402_RETRYING.wraps(request) + data = await request() + return self.parse_requirements(data) + + def parse_requirements(self, data: dict[str, Any]) -> tuple[Any, str]: + payment_response = self.x402PaymentRequiredResponse(**data) + requirements = self.client.select_payment_requirements(payment_response.accepts) + version = payment_response.x402_version + return requirements, version + + def refresh_post_kwargs( + self, + post_kwargs: dict[str, Any], + response_data: dict[str, Any], + ) -> None: + requirement_data = self.parse_requirements(response_data) + if MINIMIZE_REQUESTS: + max_cost_hash = get_max_cost_hash(post_kwargs["json"]) + CACHE[max_cost_hash] = requirement_data + headers = self.get_headers_from_requirement_data(requirement_data) + post_kwargs["headers"] = {**post_kwargs["headers"], **headers} diff --git a/zyte_api/errors.py b/zyte_api/errors.py index 0248e2c..ff7d19d 100644 --- a/zyte_api/errors.py +++ b/zyte_api/errors.py @@ -6,6 +6,10 @@ import attr +def _to_kebab_case(s: str) -> str: + return s.strip().replace(" ", "-").replace("_", "-").lower() + + @attr.s(auto_attribs=True) class ParsedError: """Parsed error response body from Zyte API.""" @@ -45,4 +49,15 @@ def from_body(cls, response_body: bytes) -> ParsedError: def type(self) -> Optional[str]: """ID of the error type, e.g. ``"/limits/over-user-limit"`` or ``"/download/temporary-error"``.""" - return (self.data or {}).get("type", None) + data = self.data or {} + if "type" in data: + return data["type"] + if "error" in data and isinstance(data["error"], str): # HTTP 402 + try: + prefix, _ = data["error"].split(":", 1) + except ValueError: + prefix = data["error"] + if len(prefix) > 32: + return None + return f"/x402/{_to_kebab_case(prefix)}" + return None diff --git a/zyte_api/stats.py b/zyte_api/stats.py index c293d3c..e52edfb 100644 --- a/zyte_api/stats.py +++ b/zyte_api/stats.py @@ -37,6 +37,7 @@ def __init__(self): ) self.n_429 = 0 # number of 429 (throttling) responses self.n_errors = 0 # number of errors, including errors which were retried + self.n_402_req = 0 # requests for a 402 (payment required) response self.status_codes = Counter() self.exception_types = Counter()