Audio: OpenAI API (#377)

* async requests enabled
* aiohttp checks
* fmt
* update audio endpoint
* update: route
* rename cat sound
* fix: docstring
* update openapi
* move jpg image
michaelfeil · Sep 24, 2024 · 978757d · 978757d
1 parent 8690ebc
commit 978757d
Show file tree

Hide file tree

Showing 10 changed files with 77 additions and 11 deletions.
diff --git a/docs/assets/openapi.json b/docs/assets/openapi.json
diff --git a/libs/embed_package/embed/_infer.py b/libs/embed_package/embed/_infer.py
@@ -111,7 +111,7 @@ def audio_embed(
         """Embed audios with a model.
 
         >>> import requests, io
-        >>> url =  "https://github.com/michaelfeil/infinity/raw/refs/heads/main/libs/infinity_emb/tests/data/audio/COMTran_Aerospacebeep1(ID2380)_BSB.wav"
+        >>> url = "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"
         >>> ei = BatchedInference(model_id="laion/larger_clap_general", engine="torch")
         >>> audio_embed_result = ei.audio_embed(model_id="laion/larger_clap_general", audios=[url])
         >>> type(audio_embed_result)

diff --git a/libs/infinity_emb/Makefile b/libs/infinity_emb/Makefile
@@ -3,7 +3,7 @@
 # Default target executed when no arguments are given to make.
 all: help
 
-precommit : | format spell_fix spell_check lint poetry_check test cli_v2_docs
+precommit : | format spell_fix spell_check lint poetry_check cli_v2_docs test 
 
 ######################
 # TESTING AND COVERAGE

diff --git a/libs/infinity_emb/infinity_emb/engine.py b/libs/infinity_emb/infinity_emb/engine.py
@@ -231,7 +231,7 @@ async def audio_embed(
         """embed multiple audios
 
         Kwargs:
-            audios (list[npt.NDArray]): list of audio data, to be embedded
+            audios (list[Union[str, Audiobytes]]): list of audio data, to be embedded
 
         Raises:
             ValueError: raised if engine is not started yet
@@ -240,7 +240,7 @@ async def audio_embed(
 
         Returns:
             list[EmbeddingReturnType]: embeddings
-                2D list-array of shape( len(sentences),embed_dim )
+                2D list-array of shape( len(audios), embed_dim )
             int: token usage
         """
 

diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
@@ -33,6 +33,10 @@
             "min_length": 1,
             "max_length": 2048,
         }
+        ITEMS_LIMIT_SMALL = {
+            "min_length": 1,
+            "max_length": 32,
+        }
     except ImportError:
         from pydantic import constr
 
@@ -41,6 +45,10 @@
             "min_items": 1,
             "max_items": 2048,
         }
+        ITEMS_LIMIT_SMALL = {
+            "min_items": 1,
+            "max_items": 32,
+        }
         HttpUrl, AnyUrl = str, str  # type: ignore
 else:
 
@@ -76,7 +84,7 @@ class ImageEmbeddingInput(BaseModel):
     input: Union[  # type: ignore
         conlist(  # type: ignore
             Annotated[AnyUrl, HttpUrl],
-            **ITEMS_LIMIT,
+            **ITEMS_LIMIT_SMALL,
         ),
         Annotated[AnyUrl, HttpUrl],
     ]
@@ -85,6 +93,10 @@ class ImageEmbeddingInput(BaseModel):
     user: Optional[str] = None
 
 
+class AudioEmbeddingInput(ImageEmbeddingInput):
+    pass
+
+
 class _EmbeddingObject(BaseModel):
     object: Literal["embedding"] = "embedding"
     embedding: Union[list[float], bytes]

diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py
@@ -16,6 +16,7 @@
 from infinity_emb.env import MANAGER
 from infinity_emb.fastapi_schemas import docs, errors
 from infinity_emb.fastapi_schemas.pymodels import (
+    AudioEmbeddingInput,
     ClassifyInput,
     ClassifyResult,
     ImageEmbeddingInput,
@@ -27,6 +28,7 @@
 )
 from infinity_emb.log_handler import UVICORN_LOG_LEVELS, logger
 from infinity_emb.primitives import (
+    AudioCorruption,
     Device,
     Dtype,
     EmbeddingDtype,
@@ -349,7 +351,7 @@ async def _classify(data: ClassifyInput):
         operation_id="embeddings_image",
     )
     async def _embeddings_image(data: ImageEmbeddingInput):
-        """Encode Embeddings
+        """Encode Embeddings from Image files
 
         ```python
         import requests
@@ -384,7 +386,59 @@ async def _embeddings_image(data: ImageEmbeddingInput):
             )
         except ModelNotDeployedError as ex:
             raise errors.OpenAIException(
-                f"ModelNotDeployedError: model=`{data.model}` does not support `embed`. Reason: {ex}",
+                f"ModelNotDeployedError: model=`{data.model}` does not support `image_embed`. Reason: {ex}",
+                code=status.HTTP_400_BAD_REQUEST,
+            )
+        except Exception as ex:
+            raise errors.OpenAIException(
+                f"InternalServerError: {ex}",
+                code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            )
+
+    @app.post(
+        f"{url_prefix}/embeddings_audio",
+        response_model=OpenAIEmbeddingResult,
+        response_class=responses.ORJSONResponse,
+        dependencies=route_dependencies,
+        operation_id="embeddings_audio",
+    )
+    async def _embeddings_audio(data: AudioEmbeddingInput):
+        """Encode Embeddings from Audio files
+
+        ```python
+        import requests
+        requests.post("http://..:7997/embeddings_audio",
+            json={"model":"laion/larger_clap_general","input":["https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"]})
+        """
+        engine = _resolve_engine(data.model)
+        if hasattr(data.input, "host"):
+            # if it is a single url
+            audio_inputs = [str(data.input)]
+        else:
+            audio_inputs = [str(d) for d in data.input]  # type: ignore
+        try:
+            logger.debug("[📝] Received request with %s Urls ", len(audio_inputs))
+            start = time.perf_counter()
+
+            embedding, usage = await engine.audio_embed(audios=audio_inputs)  # type: ignore
+
+            duration = (time.perf_counter() - start) * 1000
+            logger.debug("[✅] Done in %s ms", duration)
+
+            return OpenAIEmbeddingResult.to_embeddings_response(
+                embeddings=embedding,
+                engine_args=engine.engine_args,
+                encoding_format=data.encoding_format,
+                usage=usage,
+            )
+        except AudioCorruption as ex:
+            raise errors.OpenAIException(
+                f"AudioCorruption, could not open {audio_inputs} -> {ex}",
+                code=status.HTTP_400_BAD_REQUEST,
+            )
+        except ModelNotDeployedError as ex:
+            raise errors.OpenAIException(
+                f"ModelNotDeployedError: model=`{data.model}` does not support `audio_embed`. Reason: {ex}",
                 code=status.HTTP_400_BAD_REQUEST,
             )
         except Exception as ex:

diff --git a/...io/COMTran_Aerospacebeep1(ID2380)_BSB.wav → libs/infinity_emb/tests/data/audio/beep.wav b/...io/COMTran_Aerospacebeep1(ID2380)_BSB.wav → libs/infinity_emb/tests/data/audio/beep.wav
diff --git a/...ANMLCat_Smallmewingofacat(ID0098)_BSB.wav → ...nfinity_emb/tests/data/audio/cat_meow.wav b/...ANMLCat_Smallmewingofacat(ID0098)_BSB.wav → ...nfinity_emb/tests/data/audio/cat_meow.wav
diff --git a/libs/infinity_emb/tests/unit_test/test_engine.py b/libs/infinity_emb/tests/unit_test/test_engine.py
@@ -192,7 +192,7 @@ async def test_clap_like_model():
     engine = AsyncEmbeddingEngine.from_args(
         EngineArgs(model_name_or_path=model_name, dtype="float32")
     )
-    url = "https://github.com/michaelfeil/infinity/raw/refs/heads/main/libs/infinity_emb/tests/data/audio/COMTran_Aerospacebeep1(ID2380)_BSB.wav"
+    url = "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"
     bytes_url = requests.get(url).content
 
     inputs = ["a sound of a cat", "a sound of a cat"]
@@ -211,7 +211,7 @@ async def test_clap_like_model():
 
 @pytest.mark.anyio
 async def test_clip_embed_pil_image_input():
-    img_url = "https://github.com/michaelfeil/infinity/raw/65afe2b3d68fda10429bf7f215fe645be20788e4/docs/assets/cats_coco_sample.jpg"
+    img_url = "https://github.com/michaelfeil/infinity/raw/06fd1f4d8f0a869f4482fc1c78b62a75ccbb66a1/docs/assets/cats_coco_sample.jpg"
     response = requests.get(img_url, stream=True)
     assert response.status_code == 200
     img_data = response.raw

diff --git a/libs/infinity_emb/tests/unit_test/transformer/audio/test_audio.py b/libs/infinity_emb/tests/unit_test/transformer/audio/test_audio.py
@@ -15,7 +15,7 @@ def test_clap_like_model():
     model = ClapLikeModel(
         engine_args=EngineArgs(model_name_or_path=model_name, dtype="float16")
     )
-    url = "https://github.com/michaelfeil/infinity/raw/refs/heads/main/libs/infinity_emb/tests/data/audio/COMTran_Aerospacebeep1(ID2380)_BSB.wav"
+    url = "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"
     raw_bytes = requests.get(url, stream=True).content
     data, samplerate = sf.read(io.BytesIO(raw_bytes))