AbanteAI · mentatai · Jul 14, 2024 · Jul 14, 2024 · Jul 14, 2024 · jakethekoenig
diff --git a/spice/spice.py b/spice/spice.py
@@ -230,6 +230,9 @@ def __init__(
         logging_dir: Optional[Path | str] = None,
         logging_callback: Optional[Callable[[SpiceResponse, str, str], None]] = None,
         default_temperature: Optional[float] = None,
+        max_retries: int = 0,  # Add this line
+        base_delay: float = 1.0,  # Add this line
+        max_delay: float = 32.0,  # Add this line
     ):
         """
         Creates a new Spice client.
@@ -268,6 +271,11 @@ def __init__(
         self._default_embeddings_model = embeddings_model
         self._default_temperature = default_temperature
 
+        # Initialize retry configuration parameters
+        self.max_retries = max_retries
+        self.base_delay = base_delay
+        self.max_delay = max_delay
+
         # TODO: Should we validate model aliases?
         self._model_aliases = model_aliases
 
@@ -278,6 +286,30 @@ def __init__(
         self.logging_callback = logging_callback
         self.new_run("spice")
 
+    async def call_llm(self, client: WrappedClient, call_args: SpiceCallArgs, streaming_callback: Optional[Callable[[str], None]] = None):
+        retries = 0
+        delay = self.base_delay
+        while retries <= self.max_retries:
+            try:
+                with client.catch_and_convert_errors():
+                    if streaming_callback is not None:
+                        stream = await client.get_chat_completion_or_stream(call_args)
+                        stream = cast(AsyncIterator, stream)
+                        streaming_spice_response = StreamingSpiceResponse(
+                            self._get_text_model(call_args.model), call_args, client, stream, None, streaming_callback
+                        )
+                        return await streaming_spice_response.complete_response()
+                    else:
+                        chat_completion = await client.get_chat_completion_or_stream(call_args)
+                        text, input_tokens, output_tokens = client.extract_text_and_tokens(chat_completion, call_args)
+                        return text, input_tokens, output_tokens
+            except (APIConnectionError, APIError) as e:
+                if retries == self.max_retries:
+                    raise e
+                time.sleep(min(delay, self.max_delay))
+                delay *= 2
+                retries += 1
+
     def new_run(self, name: str):
         """
         Create a new run. All llm calls will be logged in a folder with the run name and a timestamp.
@@ -451,23 +483,12 @@ async def get_response(
             elif i > 1 and call_args.temperature is not None:
                 call_args.temperature = max(0.5, call_args.temperature)
 
-            with client.catch_and_convert_errors():
-                if streaming_callback is not None:
-                    stream = await client.get_chat_completion_or_stream(call_args)
-                    stream = cast(AsyncIterator, stream)
-                    streaming_spice_response = StreamingSpiceResponse(
-                        text_model, call_args, client, stream, None, streaming_callback
-                    )
-                    chat_completion = await streaming_spice_response.complete_response()
-                    text, input_tokens, output_tokens = (
-                        chat_completion.text,
-                        chat_completion.input_tokens,
-                        chat_completion.output_tokens,
-                    )
-
-                else:
-                    chat_completion = await client.get_chat_completion_or_stream(call_args)
-                    text, input_tokens, output_tokens = client.extract_text_and_tokens(chat_completion, call_args)
+            try:
+                text, input_tokens, output_tokens = await self.call_llm(client, call_args, streaming_callback)
+            except (APIConnectionError, APIError) as e:
+                if i == retries:
+                    raise e
+                continue
 
             completion_cost = text_request_cost(text_model, input_tokens, output_tokens)
             if completion_cost is not None:
@@ -542,8 +563,11 @@ async def stream_response(
         client = self._get_client(text_model, provider)
         call_args = self._fix_call_args(messages, text_model, True, temperature, max_tokens, response_format)
 
-        with client.catch_and_convert_errors():
-            stream = await client.get_chat_completion_or_stream(call_args)
+        try:
+            stream = await self.call_llm(client, call_args, streaming_callback)
+        except (APIConnectionError, APIError) as e:
+            raise e
+
         stream = cast(AsyncIterator, stream)
 
         def callback(response: SpiceResponse, cache: List[float] = [0]):

diff --git a/spice/wrapped_clients.py b/spice/wrapped_clients.py
@@ -3,6 +3,7 @@
 import base64
 import io
 import mimetypes
+import time
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from pathlib import Path
@@ -120,7 +121,6 @@ def extract_text_and_tokens(self, chat_completion, call_args: SpiceCallArgs):
     @override
     @contextmanager
     def catch_and_convert_errors(self):
-        # TODO: Do we catch all errors? I think we should catch APIStatusError
         try:
             yield
         except openai.APIConnectionError as e:
@@ -398,7 +398,6 @@ def catch_and_convert_errors(self):
         except anthropic.APIStatusError as e:
             raise APIError(f"Anthropic Status Error: {e.message}") from e
 
-    # Anthropic doesn't give us a way to count tokens, so we just use OpenAI's token counting functions and multiply by a pre-determined multiplier
     class _FakeWrappedOpenAIClient(WrappedOpenAIClient):
         def __init__(self):
             pass