2 changes: 1 addition & 1 deletion integrations/ollama/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = ["haystack-ai>=2.22.0", "ollama>=0.5.0", "pydantic"]
dependencies = ["haystack-ai>=2.22.0", "ollama>=0.5.0", "pydantic", "tenacity>=8.2.3"]
Contributor

Is there a specific reason for >=8.2.3 for tenacity?

Contributor Author

In the initial commit I did not pin a version, so the Python 3.13 check built with an older version of tenacity that was missing one of retry, retry_if_exception_type, or wait_exponential, which caused the verification checks to fail. Because of this, I had to specify the minimum version explicitly.


[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ollama#readme"
@@ -22,6 +22,7 @@
)
from haystack.utils.callable_serialization import deserialize_callable, serialize_callable
from pydantic.json_schema import JsonSchemaValue
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from ollama import AsyncClient, ChatResponse, Client

@@ -216,6 +217,7 @@ def __init__(
url: str = "http://localhost:11434",
generation_kwargs: dict[str, Any] | None = None,
timeout: int = 120,
max_retries: int = 0,
keep_alive: float | str | None = None,
streaming_callback: Callable[[StreamingChunk], None] | None = None,
tools: ToolsType | None = None,
@@ -233,6 +235,8 @@
[Ollama docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values).
:param timeout:
The number of seconds before throwing a timeout error from the Ollama API.
:param max_retries:
Maximum number of retries to attempt for failed requests.
Contributor

Let's make this parameter more descriptive.

Suggested change
:param max_retries:
Maximum number of retries to attempt for failed requests.
:param max_retries:
Maximum number of retries to attempt for failed requests (HTTP 429, 5xx, connection/timeout errors).
Uses exponential backoff between attempts. Set to 0 (default) to disable retries.

:param think:
If True, the model will "think" before producing a response.
Only [thinking models](https://ollama.com/search?c=thinking) support this feature.
@@ -268,6 +272,7 @@ def __init__(
self.url = url
self.generation_kwargs = generation_kwargs or {}
self.timeout = timeout
self.max_retries = max_retries
self.keep_alive = keep_alive
self.streaming_callback = streaming_callback
self.tools = tools # Store original tools for serialization
@@ -292,6 +297,7 @@ def to_dict(self) -> dict[str, Any]:
url=self.url,
generation_kwargs=self.generation_kwargs,
timeout=self.timeout,
max_retries=self.max_retries,
keep_alive=self.keep_alive,
streaming_callback=callback_name,
tools=serialize_tools_or_toolset(self.tools),
@@ -518,16 +524,25 @@ def run(

ollama_messages = [_convert_chatmessage_to_ollama_format(m) for m in messages]

response = self._client.chat(
model=self.model,
messages=ollama_messages,
tools=ollama_tools,
stream=is_stream, # type: ignore[call-overload] # Ollama expects Literal[True] or Literal[False], not bool
keep_alive=self.keep_alive,
options=generation_kwargs,
format=self.response_format,
think=self.think,
@retry(
reraise=True,
stop=stop_after_attempt(self.max_retries + 1),
retry=retry_if_exception_type(Exception),
Contributor

Retrying on all kinds of exceptions is too broad.

Contributor Author
@Keyur-S-Patel Mar 5, 2026

Retry on 429 and 5xx?

Lemme know if you want to retry on something else as well.
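
For reference, a narrower policy could look like the following sketch. It assumes the ollama client raises ollama.ResponseError (which carries a status_code) for HTTP errors and httpx connection/timeout exceptions for transport failures; the predicate name and thresholds are illustrative, not the merged change.

import httpx
from ollama import ResponseError
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential


def _is_retryable(exc: BaseException) -> bool:
    # Retry only on HTTP 429 / 5xx responses and on connection or timeout errors.
    if isinstance(exc, ResponseError):
        return exc.status_code == 429 or exc.status_code >= 500
    return isinstance(exc, (httpx.ConnectError, httpx.TimeoutException))


retry_on_transient_errors = retry(
    reraise=True,
    stop=stop_after_attempt(3),  # e.g. max_retries=2 -> 3 attempts in total
    retry=retry_if_exception(_is_retryable),
    wait=wait_exponential(),
)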

wait=wait_exponential(),
)
def chat_with_retry() -> ChatResponse | Iterator[ChatResponse]:
return self._client.chat(
model=self.model,
messages=ollama_messages,
tools=ollama_tools,
stream=is_stream, # type: ignore[call-overload] # Ollama expects Literal[True] or Literal[False], not bool
keep_alive=self.keep_alive,
options=generation_kwargs,
format=self.response_format,
think=self.think,
)

response = chat_with_retry()
Contributor

Can we have this not nested inside of run?

Contributor Author

The @retry decorator requires a function, so we can either:

  1. use a wrapper function, or
  2. build a new method and put the retry decorator there

Contributor Author

Will separate out the function in the next commit.
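
One way to avoid the closure inside run is to drive tenacity imperatively with a Retrying object, roughly as sketched below; the helper name and keyword-argument plumbing are assumptions, not the follow-up commit.

from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, wait_exponential


def chat_with_retry(client, max_retries, **chat_kwargs):
    # Call client.chat, retrying failed attempts with exponential backoff.
    retryer = Retrying(
        reraise=True,
        stop=stop_after_attempt(max_retries + 1),
        retry=retry_if_exception_type(Exception),
        wait=wait_exponential(),
    )
    return retryer(client.chat, **chat_kwargs)

An AsyncRetrying counterpart could cover run_async in the same way.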


if isinstance(response, Iterator):
return self._handle_streaming_response(response_iter=response, callback=callback)
@@ -579,16 +594,25 @@ async def run_async(

ollama_messages = [_convert_chatmessage_to_ollama_format(m) for m in messages]

response = await self._async_client.chat(
model=self.model,
messages=ollama_messages,
tools=ollama_tools,
stream=is_stream, # type: ignore[call-overload] # Ollama expects Literal[True] or Literal[False], not bool
keep_alive=self.keep_alive,
options=generation_kwargs,
format=self.response_format,
think=self.think,
@retry(
reraise=True,
stop=stop_after_attempt(self.max_retries + 1),
retry=retry_if_exception_type(Exception),
wait=wait_exponential(),
)
async def chat_with_retry() -> ChatResponse | AsyncIterator[ChatResponse]:
return await self._async_client.chat(
model=self.model,
messages=ollama_messages,
tools=ollama_tools,
stream=is_stream, # type: ignore[call-overload] # Ollama expects Literal[True] or Literal[False], not bool
keep_alive=self.keep_alive,
options=generation_kwargs,
format=self.response_format,
think=self.think,
)

response = await chat_with_retry()

if isinstance(response, AsyncIterator):
# response is an async iterator for streaming
38 changes: 38 additions & 0 deletions integrations/ollama/tests/test_chat_generator.py
@@ -518,6 +518,7 @@ def test_init_default(self):
assert component.url == "http://localhost:11434"
assert component.generation_kwargs == {}
assert component.timeout == 120
assert component.max_retries == 0
assert component.streaming_callback is None
assert component.tools is None
assert component.keep_alive is None
@@ -529,6 +530,7 @@ def test_init(self, tools):
url="http://my-custom-endpoint:11434",
generation_kwargs={"temperature": 0.5},
timeout=5,
max_retries=2,
keep_alive="10m",
streaming_callback=print_streaming_chunk,
tools=tools,
@@ -539,6 +541,7 @@ def test_init(self, tools):
assert component.url == "http://my-custom-endpoint:11434"
assert component.generation_kwargs == {"temperature": 0.5}
assert component.timeout == 5
assert component.max_retries == 2
assert component.keep_alive == "10m"
assert component.streaming_callback is print_streaming_chunk
assert component.tools == tools
@@ -603,6 +606,7 @@ def test_to_dict(self):
"type": "haystack_integrations.components.generators.ollama.chat.chat_generator.OllamaChatGenerator",
"init_parameters": {
"timeout": 120,
"max_retries": 0,
"model": "llama2",
"url": "custom_url",
"streaming_callback": "haystack.components.generators.utils.print_streaming_chunk",
@@ -650,6 +654,7 @@ def test_from_dict(self):
"type": "haystack_integrations.components.generators.ollama.chat.chat_generator.OllamaChatGenerator",
"init_parameters": {
"timeout": 120,
"max_retries": 0,
"model": "llama2",
"url": "custom_url",
"keep_alive": "5m",
@@ -689,6 +694,7 @@ def test_from_dict(self):
"some_test_param": "test-params",
}
assert component.timeout == 120
assert component.max_retries == 0
assert component.tools == [tool]
assert component.response_format == {
"type": "object",
@@ -790,6 +796,38 @@ def test_run(self, mock_client):
assert result["replies"][0].text == "Fine. How can I help you today?"
assert result["replies"][0].role == "assistant"

@patch("haystack_integrations.components.generators.ollama.chat.chat_generator.Client")
def test_run_retries_after_failure(self, mock_client):
generator = OllamaChatGenerator(max_retries=1)

mock_response = ChatResponse(
model="qwen3:0.6b",
created_at="2023-12-12T14:13:43.416799Z",
message={"role": "assistant", "content": "Recovered after retry"},
done=True,
prompt_eval_count=1,
eval_count=2,
)

mock_client_instance = mock_client.return_value
mock_client_instance.chat.side_effect = [RuntimeError("temporary failure"), mock_response]

result = generator.run(messages=[ChatMessage.from_user("Hello!")])

assert mock_client_instance.chat.call_count == 2
assert result["replies"][0].text == "Recovered after retry"

@patch("haystack_integrations.components.generators.ollama.chat.chat_generator.Client")
def test_run_raises_after_retry_exhausted(self, mock_client):
generator = OllamaChatGenerator(max_retries=1)
mock_client_instance = mock_client.return_value
mock_client_instance.chat.side_effect = RuntimeError("persistent failure")

with pytest.raises(RuntimeError, match="persistent failure"):
generator.run(messages=[ChatMessage.from_user("Hello!")])

assert mock_client_instance.chat.call_count == 2

@patch("haystack_integrations.components.generators.ollama.chat.chat_generator.Client")
def test_run_streaming(self, mock_client):
collected_chunks = []