From 196f07aa874eaf51bbc0ac531f2930da8c6577fa Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 06:42:26 +0100
Subject: [PATCH 1/9] refactored + asyncio

---
 .github/workflows/quality_checks.yaml |  34 ++++++
 .github/workflows/release.yaml        |   2 +-
 .github/workflows/tests.yaml          |   5 +-
 example.py                            |  31 ++---
 py_tgi/__init__.py                    |   3 -
 py_tgi/inference_server.py            | 143 -----------------------
 py_tgi/text_embedding_inference.py    | 144 -----------------------
 py_tgi/text_generation_inference.py   | 144 -----------------------
 py_txi/__init__.py                    |   3 +
 py_txi/docker_inference_server.py     | 162 ++++++++++++++++++++++++++
 py_txi/text_embedding_inference.py    |  56 +++++++++
 py_txi/text_generation_inference.py   |  84 +++++++++++++
 {py_tgi => py_txi}/utils.py           |   9 +-
 setup.py                              |   2 +-
 tests/test.py                         |  31 ++---
 15 files changed, 384 insertions(+), 469 deletions(-)
 create mode 100644 .github/workflows/quality_checks.yaml
 delete mode 100644 py_tgi/__init__.py
 delete mode 100644 py_tgi/inference_server.py
 delete mode 100644 py_tgi/text_embedding_inference.py
 delete mode 100644 py_tgi/text_generation_inference.py
 create mode 100644 py_txi/__init__.py
 create mode 100644 py_txi/docker_inference_server.py
 create mode 100644 py_txi/text_embedding_inference.py
 create mode 100644 py_txi/text_generation_inference.py
 rename {py_tgi => py_txi}/utils.py (67%)

diff --git a/.github/workflows/quality_checks.yaml b/.github/workflows/quality_checks.yaml
new file mode 100644
index 0000000..a7bc11e
--- /dev/null
+++ b/.github/workflows/quality_checks.yaml
@@ -0,0 +1,34 @@
+name: quality checks
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run_quality_checks:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.10"
+
+      - name: Install quality requirements
+        run: |
+          pip install --upgrade pip
+          pip install -e .[quality]
+
+      - name: Check style
+        run: |
+          make quality
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 25b596f..6b86b5e 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -5,7 +5,7 @@ on:
     types: [created]
 
 jobs:
-  deploy:
+  release:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index f6b283b..0f73035 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -27,7 +27,8 @@ jobs:
       - name: Install requirements
         run: |
           pip install --upgrade pip
-          pip install -e .
+          pip install -e .[testing]
 
       - name: Run test
-        run: python tests/test.py
+        run: |
+          pytest tests/
diff --git a/example.py b/example.py
index 1c63620..42f6557 100644
--- a/example.py
+++ b/example.py
@@ -1,20 +1,21 @@
-from py_tgi import TEI, TGI, is_nvidia_system, is_rocm_system
+from py_txi.text_embedding_inference import TEI, TEIConfig
+from py_txi.text_generation_inference import TGI, TGIConfig
+from py_txi.utils import get_free_port
 
-if is_nvidia_system():
-    llm = TGI(model="NousResearch/Llama-2-7b-hf", gpus="all", port=1234)
-elif is_rocm_system():
-    llm = TGI(model="NousResearch/Llama-2-7b-hf", devices=["/dev/kfd", "/dev/dri"], port=1234)
-else:
-    llm = TGI(model="NousResearch/Llama-2-7b-hf", port=1234)
+port = get_free_port()
+ports = {"80/tcp": ("127.0.0.1", port)}
 
+tei_config = TEIConfig(pooling="cls", ports=ports)
+embed = TEI(tei_config)
+output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"])
+print("Embed:", output)
+embed.close()
 
+port = get_free_port()
+ports = {"80/tcp": ("127.0.0.1", port)}
+
+tgi_config = TGIConfig(ports=ports)
+llm = TGI(tgi_config)
 output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"])
 print("LLM:", output)
-
-if is_nvidia_system():
-    embed = TEI(model="BAAI/bge-large-en-v1.5", dtype="float16", pooling="mean", gpus="all", port=4321)
-else:
-    embed = TEI(model="BAAI/bge-large-en-v1.5", dtype="float16", pooling="mean", port=4321)
-
-output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"])
-print("Embed:", output)
+llm.close()
diff --git a/py_tgi/__init__.py b/py_tgi/__init__.py
deleted file mode 100644
index 870eedd..0000000
--- a/py_tgi/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .text_embedding_inference import TEI  # noqa
-from .text_generation_inference import TGI  # noqa
-from .utils import is_nvidia_system, is_rocm_system  # noqa
diff --git a/py_tgi/inference_server.py b/py_tgi/inference_server.py
deleted file mode 100644
index 95eb817..0000000
--- a/py_tgi/inference_server.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import os
-import re
-from abc import ABC
-from logging import INFO, basicConfig, getLogger
-from typing import Any, Dict, List, Optional, Union
-
-import docker
-import docker.errors
-import docker.types
-
-from .utils import HF_CACHE_DIR
-
-basicConfig(level=INFO)
-
-
-DOCKER = docker.from_env()
-LOGGER = getLogger("inference-server")
-
-
-class InferenceServer(ABC):
-    NAME: str = "Inference-Server"
-
-    def __init__(
-        self,
-        # model options
-        model: str,
-        revision: str,
-        # image options
-        image: str,
-        # docker options
-        port: int = 1111,
-        shm_size: str = "1g",
-        address: str = "127.0.0.1",
-        volumes: Dict[str, Any] = {HF_CACHE_DIR: "/data"},  # connects local hf cache to /data folder
-        gpus: Optional[Union[str, int]] = None,  # e.g. "all" or "0,1,2,3" or 4 for NVIDIA
-        devices: Optional[List[str]] = None,  # e.g. ["/dev/kfd", "/dev/dri"] for ROCm
-        # launcher options
-        **kwargs,
-    ) -> None:
-        # model options
-        self.model = model
-        self.revision = revision
-        # docker options
-        self.port = port
-        self.image = image
-        self.volumes = volumes
-        self.address = address
-        self.shm_size = shm_size
-        # device options
-        self.gpus = gpus
-        self.devices = devices
-
-        try:
-            LOGGER.info(f"\t+ Checking if {self.NAME} image is available locally")
-            DOCKER.images.get(self.image)
-            LOGGER.info(f"\t+ {self.NAME} image found locally")
-        except docker.errors.ImageNotFound:
-            LOGGER.info(f"\t+ {self.NAME} image not found locally, pulling from Docker Hub")
-            DOCKER.images.pull(self.image)
-
-        LOGGER.info(f"\t+ Building {self.NAME} URL")
-        self.build_url()
-
-        LOGGER.info(f"\t+ Building {self.NAME} environment")
-        self.build_env()
-
-        LOGGER.info(f"\t+ Building {self.NAME} devices")
-        self.build_devices()
-
-        LOGGER.info(f"\t+ Building {self.NAME} command")
-        self.build_command()
-
-        LOGGER.info(f"\t+ Running {self.NAME} server")
-        self.run_container()
-
-        LOGGER.info(f"\t+ Waiting for {self.NAME} server to be ready")
-        self.wait()
-
-        LOGGER.info(f"\t+ Connecting to {self.NAME} server")
-        self.connect_client()
-
-    def run_container(self):
-        self.container = DOCKER.containers.run(
-            image=self.image,
-            command=self.command,
-            shm_size=self.shm_size,
-            ports={"80/tcp": (self.address, self.port)},
-            volumes={source: {"bind": target, "mode": "rw"} for source, target in self.volumes.items()},
-            device_requests=self.device_requests,
-            devices=self.devices,
-            environment=self.env,
-            auto_remove=True,
-            detach=True,
-        )
-
-    def wait(self):
-        raise NotImplementedError
-
-    def connect_client(self):
-        raise NotImplementedError
-
-    def build_devices(self):
-        if self.gpus is not None and isinstance(self.gpus, str) and self.gpus == "all":
-            LOGGER.info("\t+ Using all GPU(s)")
-            self.device_requests = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])]
-        elif self.gpus is not None and isinstance(self.gpus, int):
-            LOGGER.info(f"\t+ Using {self.gpus} GPU(s)")
-            self.device_requests = [docker.types.DeviceRequest(count=self.gpus, capabilities=[["gpu"]])]
-        elif self.gpus is not None and isinstance(self.gpus, str) and re.match(r"^\d+(,\d+)*$", self.gpus):
-            LOGGER.info(f"\t+ Using GPU(s) {self.gpus}")
-            self.device_requests = [docker.types.DeviceRequest(device_ids=[self.gpus], capabilities=[["gpu"]])]
-        else:
-            LOGGER.info("\t+ Not using any GPU(s)")
-            self.device_requests = None
-
-        if self.devices is not None and isinstance(self.devices, list) and all(os.path.exists(d) for d in self.devices):
-            LOGGER.info(f"\t+ Using custom device(s) {self.devices}")
-            self.devices = self.devices
-        else:
-            LOGGER.info("\t+ Not using any custom device(s)")
-            self.devices = None
-
-    def build_url(self):
-        self.url = f"http://{self.address}:{self.port}"
-
-    def build_env(self):
-        self.env = {}
-
-    def build_command(self):
-        self.command = []
-
-    def close(self) -> None:
-        if hasattr(self, "container"):
-            LOGGER.info("\t+ Stoping Docker container")
-            self.container.stop()
-            self.container.wait()
-            LOGGER.info("\t+ Docker container stopped")
-
-    def __del__(self):
-        try:
-            self.close()
-        except Exception:
-            pass
diff --git a/py_tgi/text_embedding_inference.py b/py_tgi/text_embedding_inference.py
deleted file mode 100644
index 42f2c41..0000000
--- a/py_tgi/text_embedding_inference.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import os
-import time
-from concurrent.futures import ThreadPoolExecutor
-from logging import getLogger
-from typing import Any, Dict, List, Literal, Optional, Union
-
-import numpy as np
-from huggingface_hub import InferenceClient
-
-from .inference_server import InferenceServer
-from .utils import CONNECTION_TIMEOUT, HF_CACHE_DIR
-
-LOGGER = getLogger("TEI")
-
-
-Pooling_Literal = Literal["cls", "mean"]
-DType_Literal = Literal["float32", "float16"]
-
-
-class TEI(InferenceServer):
-    NAME: str = "Text-Embedding-Inference"
-
-    def __init__(
-        self,
-        # model options
-        model: str,
-        revision: str = "main",
-        # image options
-        image: str = "ghcr.io/huggingface/text-embeddings-inference:latest",
-        # docker options
-        port: int = 1111,
-        shm_size: str = "1g",
-        address: str = "127.0.0.1",
-        volumes: Dict[str, Any] = {HF_CACHE_DIR: "/data"},  # connects local hf cache to /data folder
-        devices: Optional[List[str]] = None,  # e.g. ["/dev/kfd", "/dev/dri"] for ROCm
-        gpus: Optional[Union[str, int]] = None,  # e.g. "all" or "0,1,2,3" or 4 for NVIDIA
-        # launcher options
-        # tgi launcher options
-        dtype: Optional[DType_Literal] = None,
-        pooling: Optional[Pooling_Literal] = None,
-        tokenization_workers: Optional[int] = None,
-        max_concurrent_requests: Optional[int] = None,
-        max_batch_tokens: Optional[int] = None,
-        max_batch_requests: Optional[int] = None,
-        max_client_batch_size: Optional[int] = None,
-    ) -> None:
-        # tgi launcher options
-        self.dtype = dtype
-        self.pooling = pooling
-        self.tokenization_workers = tokenization_workers
-        self.max_concurrent_requests = max_concurrent_requests
-        self.max_batch_tokens = max_batch_tokens
-        self.max_batch_requests = max_batch_requests
-        self.max_client_batch_size = max_client_batch_size
-
-        if gpus is None and "cpu-" not in image:
-            LOGGER.warning("No GPUs were specified, but the image does not contain 'cpu-'. Adding it.")
-            image_, tag_ = image.split(":")
-            image = f"{image_}:cpu-{tag_}"
-
-        super().__init__(
-            model=model,
-            revision=revision,
-            image=image,
-            port=port,
-            shm_size=shm_size,
-            address=address,
-            volumes=volumes,
-            devices=devices,
-            gpus=gpus,
-        )
-
-    def wait(self):
-        for line in self.container.logs(stream=True):
-            log = line.decode("utf-8").strip()
-            if "Ready" in log:
-                LOGGER.info(f"\t {log}")
-                break
-            elif "Error" in log:
-                LOGGER.info(f"\t {log}")
-                raise Exception(f"{self.NAME} server failed to start")
-            else:
-                LOGGER.info(f"\t {log}")
-
-    def connect_client(self):
-        start_time = time.time()
-        while time.time() - start_time < CONNECTION_TIMEOUT:
-            try:
-                self.client = InferenceClient(model=self.url)
-                self.client.feature_extraction("Hello world!")
-                LOGGER.info(f"\t+ Connected to {self.NAME} server successfully")
-                return
-            except Exception:
-                LOGGER.info(f"\t+ {self.NAME} server is not ready yet, waiting 1 second")
-                time.sleep(1)
-
-        raise Exception(f"{self.NAME} server took too long to start (60 seconds)")
-
-    def build_command(self):
-        self.command = ["--model-id", self.model, "--revision", self.revision]
-        if self.dtype:
-            self.command.extend(["--dtype", self.dtype])
-        if self.pooling:
-            self.command.extend(["--pooling", self.pooling])
-        if self.tokenization_workers:
-            self.command.extend(["--tokenization-workers", str(self.tokenization_workers)])
-        if self.max_concurrent_requests:
-            self.command.extend(["--max-concurrent-requests", str(self.max_concurrent_requests)])
-        if self.max_batch_tokens:
-            self.command.extend(["--max-batch-tokens", str(self.max_batch_tokens)])
-        if self.max_batch_requests:
-            self.command.extend(["--max-batch-requests", str(self.max_batch_requests)])
-        if self.max_client_batch_size:
-            self.command.extend(["--max-client-batch-size", str(self.max_client_batch_size)])
-
-    def build_env(self):
-        self.env = {}
-        if os.environ.get("HUGGING_FACE_HUB_TOKEN", None) is not None:
-            self.env["HUGGING_FACE_HUB_TOKEN"] = os.environ["HUGGING_FACE_HUB_TOKEN"]
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        return cls(*args, **kwargs)
-
-    def encode(self, text: Union[str, List[str]], **kwargs) -> Union[np.ndarray, List[np.ndarray]]:
-        if isinstance(text, str):
-            output = self.client.feature_extraction(text=text, **kwargs)
-            return output
-
-        elif isinstance(text, list):
-            outputs = []
-
-            with ThreadPoolExecutor(max_workers=len(text)) as executor:
-                futures = [
-                    executor.submit(self.client.feature_extraction, text=text[i], **kwargs) for i in range(len(text))
-                ]
-
-            for i in range(len(text)):
-                outputs.append(futures[i].result())
-
-            return outputs
-
-    def __call__(self, text: Union[str, List[str]], **kwargs) -> Union[np.ndarray, List[np.ndarray]]:
-        return self.encode(text, **kwargs)
diff --git a/py_tgi/text_generation_inference.py b/py_tgi/text_generation_inference.py
deleted file mode 100644
index 0e3cf94..0000000
--- a/py_tgi/text_generation_inference.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import os
-import time
-from concurrent.futures import ThreadPoolExecutor
-from logging import getLogger
-from typing import Any, Dict, List, Literal, Optional, Union
-
-from huggingface_hub import InferenceClient
-from huggingface_hub.inference._text_generation import TextGenerationResponse
-
-from .inference_server import InferenceServer
-from .utils import CONNECTION_TIMEOUT, HF_CACHE_DIR, is_rocm_system
-
-LOGGER = getLogger("TGI")
-
-
-DType_Literal = Literal["float32", "float16", "bfloat16"]
-Quantize_Literal = Literal["bitsandbytes-nf4", "bitsandbytes-fp4", "gptq"]
-
-
-class TGI(InferenceServer):
-    NAME: str = "Text-Generation-Inference"
-
-    def __init__(
-        self,
-        # model options
-        model: str,
-        revision: str = "main",
-        # image options
-        image: str = "ghcr.io/huggingface/text-generation-inference:latest",
-        # docker options
-        port: int = 1111,
-        shm_size: str = "1g",
-        address: str = "127.0.0.1",
-        volumes: Dict[str, Any] = {HF_CACHE_DIR: "/data"},  # connects local hf cache to /data folder
-        devices: Optional[List[str]] = None,  # e.g. ["/dev/kfd", "/dev/dri"] for ROCm
-        gpus: Optional[Union[str, int]] = None,  # e.g. "all" or "0,1,2,3" or 4 for NVIDIA
-        # launcher options
-        # tgi launcher options
-        sharded: Optional[bool] = None,
-        num_shard: Optional[int] = None,
-        dtype: Optional[DType_Literal] = None,
-        quantize: Optional[Quantize_Literal] = None,
-        trust_remote_code: Optional[bool] = False,
-        disable_custom_kernels: Optional[bool] = False,
-    ) -> None:
-        # tgi launcher options
-        self.dtype = dtype
-        self.sharded = sharded
-        self.quantize = quantize
-        self.num_shard = num_shard
-        self.trust_remote_code = trust_remote_code
-        self.disable_custom_kernels = disable_custom_kernels
-
-        if devices and is_rocm_system() and "-rocm" not in image:
-            LOGGER.warning("ROCm system detected, but the image does not contain '-rocm'. Adding it.")
-            image = image + "-rocm"
-
-        super().__init__(
-            model=model,
-            revision=revision,
-            image=image,
-            port=port,
-            shm_size=shm_size,
-            address=address,
-            volumes=volumes,
-            devices=devices,
-            gpus=gpus,
-        )
-
-    def wait(self):
-        for line in self.container.logs(stream=True):
-            log = line.decode("utf-8").strip()
-            if "Connected" in log:
-                LOGGER.info(f"\t {log}")
-                break
-            elif "Error" in log:
-                LOGGER.info(f"\t {log}")
-                raise Exception(f"{self.NAME} server failed to start")
-            else:
-                LOGGER.info(f"\t {log}")
-
-    def connect_client(self):
-        start_time = time.time()
-        while time.time() - start_time < CONNECTION_TIMEOUT:
-            try:
-                self.client = InferenceClient(model=self.url)
-                self.client.text_generation("Hello world!")
-                LOGGER.info(f"\t+ Connected to {self.NAME} server successfully")
-                return
-            except Exception:
-                LOGGER.info(f"\t+ {self.NAME} server is not ready yet, waiting 1 second")
-                time.sleep(1)
-
-        raise Exception(f"{self.NAME} server took too long to start (60 seconds)")
-
-    def build_command(self):
-        self.command = ["--model-id", self.model, "--revision", self.revision]
-        if self.sharded is not None:
-            self.command.extend(["--sharded", str(self.sharded).lower()])
-        if self.num_shard is not None:
-            self.command.extend(["--num-shard", str(self.num_shard)])
-        if self.quantize is not None:
-            self.command.extend(["--quantize", self.quantize])
-        if self.dtype is not None:
-            self.command.extend(["--dtype", self.dtype])
-
-        if self.trust_remote_code:
-            self.command.append("--trust-remote-code")
-        if self.disable_custom_kernels:
-            self.command.append("--disable-custom-kernels")
-
-    def build_env(self):
-        self.env = {}
-        if os.environ.get("HUGGING_FACE_HUB_TOKEN", None) is not None:
-            self.env["HUGGING_FACE_HUB_TOKEN"] = os.environ["HUGGING_FACE_HUB_TOKEN"]
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        return cls(*args, **kwargs)
-
-    def generate(
-        self, prompt: Union[str, List[str]], **kwargs
-    ) -> Union[TextGenerationResponse, List[TextGenerationResponse]]:
-        if isinstance(prompt, str):
-            output = self.client.text_generation(prompt=prompt, **kwargs)
-            return output
-
-        elif isinstance(prompt, list):
-            outputs = []
-
-            with ThreadPoolExecutor(max_workers=len(prompt)) as executor:
-                futures = [
-                    executor.submit(self.client.text_generation, prompt=prompt[i], **kwargs) for i in range(len(prompt))
-                ]
-
-            for i in range(len(prompt)):
-                outputs.append(futures[i].result())
-
-            return outputs
-
-    def __call__(
-        self, prompt: Union[str, List[str]], **kwargs
-    ) -> Union[TextGenerationResponse, List[TextGenerationResponse]]:
-        return self.generate(prompt, **kwargs)
diff --git a/py_txi/__init__.py b/py_txi/__init__.py
new file mode 100644
index 0000000..16c2c4f
--- /dev/null
+++ b/py_txi/__init__.py
@@ -0,0 +1,3 @@
+# from .text_embedding_inference import TEI  # noqa
+# from .text_generation_inference import TGI  # noqa
+# from .utils import is_nvidia_system, is_rocm_system  # noqa
diff --git a/py_txi/docker_inference_server.py b/py_txi/docker_inference_server.py
new file mode 100644
index 0000000..bd8302e
--- /dev/null
+++ b/py_txi/docker_inference_server.py
@@ -0,0 +1,162 @@
+import asyncio
+import os
+import re
+import time
+from abc import ABC
+from dataclasses import asdict, dataclass, field
+from logging import INFO, basicConfig, getLogger
+from typing import Any, Dict, List, Optional, Union
+
+import docker
+import docker.errors
+import docker.types
+from huggingface_hub import AsyncInferenceClient
+
+from .utils import get_free_port
+
+basicConfig(level=INFO)
+
+DOCKER = docker.from_env()
+LOGGER = getLogger("docker-inference-server")
+
+
+@dataclass
+class DockerInferenceServerConfig:
+    # Image to use for the container
+    image: str
+    # Shared memory size for the container
+    shm_size: str = "1g"
+    # List of custom devices to forward to the container e.g. ["/dev/kfd", "/dev/dri"] for ROCm
+    devices: Optional[List[str]] = None
+    # NVIDIA-docker GPU device options e.g. "all" (all) or "0,1,2,3" (ids) or 4 (count)
+    gpus: Optional[Union[str, int]] = None
+
+    ports: Dict[str, Any] = field(
+        default_factory=lambda: {"80/tcp": ("127.0.0.1", 0)},
+        metadata={"help": "Dictionary of ports to expose from the container."},
+    )
+    volumes: Dict[str, Any] = field(
+        default_factory=lambda: {os.path.expanduser("~/.cache/huggingface/hub"): {"bind": "/data", "mode": "rw"}},
+        metadata={"help": "Dictionary of volumes to mount inside the container."},
+    )
+    environment: Dict[str, str] = field(
+        default_factory=lambda: {"HUGGINGFACE_HUB_TOKEN": os.environ.get("HUGGINGFACE_HUB_TOKEN", "")},
+        metadata={"help": "Dictionary of environment variables to forward to the container."},
+    )
+
+    timeout: int = 60
+
+    def __post_init__(self) -> None:
+        if self.ports["80/tcp"][1] == 0:
+            LOGGER.info("\t+ Getting a free port for the server")
+            self.ports["80/tcp"] = (self.ports["80/tcp"][0], get_free_port())
+
+
+class DockerInferenceServer(ABC):
+    NAME: str = "Docker-Inference-Server"
+    SUCCESS_SENTINEL: str = "Success"
+    FAILURE_SENTINEL: str = "Failure"
+
+    def __init__(self, config: DockerInferenceServerConfig) -> None:
+        self.config = config
+
+        try:
+            LOGGER.info(f"\t+ Checking if {self.NAME} image is available locally")
+            DOCKER.images.get(self.config.image)
+            LOGGER.info(f"\t+ {self.NAME} image found locally")
+        except docker.errors.ImageNotFound:
+            LOGGER.info(f"\t+ {self.NAME} image not found locally, pulling from Docker Hub")
+            DOCKER.images.pull(self.config.image)
+
+        if self.config.gpus is not None and isinstance(self.config.gpus, str) and self.config.gpus == "all":
+            LOGGER.info("\t+ Using all GPU(s)")
+            self.device_requests = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])]
+        elif self.config.gpus is not None and isinstance(self.config.gpus, int):
+            LOGGER.info(f"\t+ Using {self.config.gpus} GPU(s)")
+            self.device_requests = [docker.types.DeviceRequest(count=self.config.gpus, capabilities=[["gpu"]])]
+        elif (
+            self.config.gpus is not None
+            and isinstance(self.config.gpus, str)
+            and re.match(r"^\d+(,\d+)*$", self.config.gpus)
+        ):
+            LOGGER.info(f"\t+ Using GPU(s) {self.config.gpus}")
+            self.device_requests = [docker.types.DeviceRequest(device_ids=[self.config.gpus], capabilities=[["gpu"]])]
+        else:
+            LOGGER.info("\t+ Not using any GPU(s)")
+            self.device_requests = None
+
+        LOGGER.info(f"\t+ Building {self.NAME} command")
+        self.command = []
+        for k, v in asdict(self.config).items():
+            if k in DockerInferenceServerConfig.__annotations__:
+                continue
+            elif v is not None:
+                if isinstance(v, bool):
+                    self.command.append(f"--{k.replace('_', '-')}")
+                else:
+                    self.command.append(f"--{k.replace('_', '-')}={v}")
+
+        address, port = self.config.ports["80/tcp"]
+        self.url = f"http://{address}:{port}"
+
+        LOGGER.info(f"\t+ Running {self.NAME} container")
+        self.container = DOCKER.containers.run(
+            image=self.config.image,
+            ports=self.config.ports,
+            volumes=self.config.volumes,
+            devices=self.config.devices,
+            shm_size=self.config.shm_size,
+            environment=self.config.environment,
+            device_requests=self.device_requests,
+            command=self.command,
+            auto_remove=True,
+            detach=True,
+        )
+
+        LOGGER.info(f"\t+ Streaming {self.NAME} server logs")
+        for line in self.container.logs(stream=True):
+            log = line.decode("utf-8").strip()
+            if self.SUCCESS_SENTINEL.lower() in log.lower():
+                LOGGER.info(f"\t {log}")
+                break
+            elif self.FAILURE_SENTINEL.lower() in log.lower():
+                LOGGER.info(f"\t {log}")
+                raise Exception(f"{self.NAME} server failed to start")
+            else:
+                LOGGER.info(f"\t {log}")
+
+        LOGGER.info(f"\t+ Waiting for {self.NAME} server to be ready")
+        start_time = time.time()
+        while time.time() - start_time < self.config.timeout:
+            try:
+                if not hasattr(self, "client"):
+                    self.client = AsyncInferenceClient(model=self.url)
+
+                asyncio.run(self.single_client_call(f"Hello {self.NAME}!"))
+                LOGGER.info(f"\t+ Connected to {self.NAME} server successfully")
+                break
+            except Exception:
+                LOGGER.info(f"\t+ {self.NAME} server is not ready yet, waiting 1 second")
+                time.sleep(1)
+
+    async def single_client_call(self, *args, **kwargs) -> Any:
+        raise NotImplementedError
+
+    async def batch_client_call(self, *args, **kwargs) -> Any:
+        raise NotImplementedError
+
+    def close(self) -> None:
+        if hasattr(self, "container"):
+            LOGGER.info("\t+ Stoping Docker container")
+            self.container.stop()
+            self.container.wait()
+            del self.container
+            LOGGER.info("\t+ Docker container stopped")
+
+        if hasattr(self, "client"):
+            LOGGER.info("\t+ Stoping Inference client")
+            del self.client
+            LOGGER.info("\t+ Inference client stopped")
+
+    def __del__(self) -> None:
+        self.close()
diff --git a/py_txi/text_embedding_inference.py b/py_txi/text_embedding_inference.py
new file mode 100644
index 0000000..38e19d1
--- /dev/null
+++ b/py_txi/text_embedding_inference.py
@@ -0,0 +1,56 @@
+import asyncio
+from dataclasses import dataclass
+from logging import getLogger
+from typing import List, Literal, Optional, Union
+
+import numpy as np
+
+from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig
+
+LOGGER = getLogger("TEI")
+
+
+Pooling_Literal = Literal["cls", "mean"]
+DType_Literal = Literal["float32", "float16"]
+
+
+@dataclass(order=False)
+class TEIConfig(DockerInferenceServerConfig):
+    # Docker options
+    image: str = "ghcr.io/huggingface/text-embeddings-inference:cpu-latest"
+    # Launcher options
+    model_id: str = "bert-base-uncased"
+    revision: str = "main"
+    dtype: Optional[DType_Literal] = None
+    pooling: Optional[Pooling_Literal] = None
+    tokenization_workers: Optional[int] = None
+
+    def __post_init__(self) -> None:
+        super().__post_init__()
+
+
+class TEI(DockerInferenceServer):
+    NAME: str = "Text-Embedding-Inference"
+    SUCCESS_SENTINEL: str = "Ready"
+    FAILURE_SENTINEL: str = "Error"
+
+    def __init__(self, config: TEIConfig) -> None:
+        super().__init__(config)
+
+    async def single_client_call(self, text: str, **kwargs) -> np.ndarray:
+        output = await self.client.feature_extraction(text=text, **kwargs)
+        return output
+
+    async def batch_client_call(self, text: List[str], **kwargs) -> List[np.ndarray]:
+        output = await asyncio.gather(*[self.single_client_call(t, **kwargs) for t in text])
+        return output
+
+    def encode(self, text: Union[str, List[str]], **kwargs) -> Union[np.ndarray, List[np.ndarray]]:
+        if isinstance(text, str):
+            output = asyncio.run(self.single_client_call(text, **kwargs))
+            return output
+        elif isinstance(text, list):
+            output = asyncio.run(self.batch_client_call(text, **kwargs))
+            return output
+        else:
+            raise ValueError(f"Unsupported input type: {type(text)}")
diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py
new file mode 100644
index 0000000..10bad10
--- /dev/null
+++ b/py_txi/text_generation_inference.py
@@ -0,0 +1,84 @@
+import asyncio
+from dataclasses import dataclass
+from logging import getLogger
+from typing import Literal, Optional, Union
+
+from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig
+
+LOGGER = getLogger("TGI")
+
+
+DType_Literal = Literal["float32", "float16", "bfloat16"]
+Quantize_Literal = Literal["bitsandbytes-nf4", "bitsandbytes-fp4", "gptq"]
+
+
+@dataclass
+class TGIConfig(DockerInferenceServerConfig):
+    # { model_id: "gpt2", revision: Some("main"), validation_workers: 2, sharded: None, num_shard: None, quantize: None, speculate: None, dtype: None, trust_remote_code: false, max_concurrent_requests: 128, max_best_of: 2, max_stop_sequences: 4, max_top_n_tokens: 5, max_input_length: 1024, max_total_tokens: 2048, waiting_served_ratio: 1.2, max_batch_prefill_tokens: 4096, max_batch_total_tokens: None, max_waiting_tokens: 20, max_batch_size: None, enable_cuda_graphs: false, hostname: "6fedb07983ae", port: 80, shard_uds_path: "/tmp/text-generation-server", master_addr: "localhost", master_port: 29500, huggingface_hub_cache: Some("/data"), weights_cache_override: None, disable_custom_kernels: false, cuda_memory_fraction: 1.0, rope_scaling: None, rope_factor: None, json_output: false, otlp_endpoint: None, cors_allow_origin: [], watermark_gamma: None, watermark_delta: None, ngrok: false, ngrok_authtoken: None, ngrok_edge: None, tokenizer_config_path: None, disable_grammar_support: false, env: false }
+
+    # Docker options
+    image: str = "ghcr.io/huggingface/text-generation-inference:latest"
+    # Launcher options
+    model_id: str = "gpt2"
+    revision: str = "main"
+    dtype: Optional[DType_Literal] = None
+    quantize: Optional[Quantize_Literal] = None
+    sharded: Optional[bool] = None
+    num_shard: Optional[int] = None
+    trust_remote_code: Optional[bool] = None
+    disable_custom_kernels: Optional[bool] = None
+    # Inference options
+    max_best_of: Optional[int] = None
+    max_concurrent_requests: Optional[int] = None
+    max_stop_sequences: Optional[int] = None
+    max_top_n_tokens: Optional[int] = None
+    max_input_length: Optional[int] = None
+    max_total_tokens: Optional[int] = None
+    waiting_served_ratio: Optional[float] = None
+    max_batch_prefill_tokens: Optional[int] = None
+    max_batch_total_tokens: Optional[int] = None
+    max_waiting_tokens: Optional[int] = None
+    max_batch_size: Optional[int] = None
+    enable_cuda_graphs: Optional[bool] = None
+    huggingface_hub_cache: Optional[str] = None
+    weights_cache_override: Optional[str] = None
+    cuda_memory_fraction: Optional[float] = None
+    rope_scaling: Optional[str] = None
+    rope_factor: Optional[str] = None
+    json_output: Optional[bool] = None
+    otlp_endpoint: Optional[str] = None
+    cors_allow_origin: Optional[list] = None
+    watermark_gamma: Optional[str] = None
+    watermark_delta: Optional[str] = None
+    tokenizer_config_path: Optional[str] = None
+    disable_grammar_support: Optional[bool] = None
+
+    def __post_init__(self) -> None:
+        super().__post_init__()
+
+
+class TGI(DockerInferenceServer):
+    NAME: str = "Text-Generation-Inference"
+    SUCCESS_SENTINEL: str = "Connected"
+    FAILURE_SENTINEL: str = "Error"
+
+    def __init__(self, config: TGIConfig) -> None:
+        super().__init__(config)
+
+    async def single_client_call(self, prompt: str, **kwargs) -> str:
+        output = await self.client.text_generation(prompt=prompt, **kwargs)
+        return output
+
+    async def batch_client_call(self, prompt: list, **kwargs) -> list:
+        output = await asyncio.gather(*[self.single_client_call(prompt=p, **kwargs) for p in prompt])
+        return output
+
+    def generate(self, prompt: Union[str, list], **kwargs) -> Union[str, list]:
+        if isinstance(prompt, str):
+            output = asyncio.run(self.single_client_call(prompt, **kwargs))
+            return output
+        elif isinstance(prompt, list):
+            output = asyncio.run(self.batch_client_call(prompt, **kwargs))
+            return output
+        else:
+            raise ValueError(f"Unsupported input type: {type(prompt)}")
diff --git a/py_tgi/utils.py b/py_txi/utils.py
similarity index 67%
rename from py_tgi/utils.py
rename to py_txi/utils.py
index eeacc70..473d042 100644
--- a/py_tgi/utils.py
+++ b/py_txi/utils.py
@@ -1,8 +1,11 @@
-import os
+import socket
 import subprocess
 
-HF_CACHE_DIR = os.path.expanduser("~/.cache/huggingface/hub")
-CONNECTION_TIMEOUT = 60
+
+def get_free_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
 
 
 def is_rocm_system() -> bool:
diff --git a/setup.py b/setup.py
index 11b3320..d995654 100644
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,6 @@
     version=PY_TGI_VERSION,
     packages=find_packages(),
     install_requires=["docker", "huggingface-hub", "numpy"],
-    extras_require={"quality": ["ruff"]},
+    extras_require={"quality": ["ruff"], "testing": ["pytest"]},
     **common_setup_kwargs,
 )
diff --git a/tests/test.py b/tests/test.py
index d55f119..573b109 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,15 +1,20 @@
 import numpy as np
 
-from py_tgi import TEI, TGI
-
-embed = TEI(model="bert-base-uncased", dtype="float16", pooling="mean", port=1234)
-output = embed.encode("Hi, I'm a language model")
-assert isinstance(output, np.ndarray)
-output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"])
-assert isinstance(output, list) and all(isinstance(x, np.ndarray) for x in output)
-
-llm = TGI(model="gpt2", sharded=False, port=4321)
-output = llm.generate("Hi, I'm a sanity test")
-assert isinstance(output, str)
-output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"])
-assert isinstance(output, list) and all(isinstance(x, str) for x in output)
+from py_txi.text_embedding_inference import TEI, TEIConfig
+from py_txi.text_generation_inference import TGI, TGIConfig
+
+
+def test_tei():
+    embed = TEI(config=TEIConfig(pooling="cls", gpus=1))
+    output = embed.encode("Hi, I'm a language model")
+    assert isinstance(output, np.ndarray)
+    output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"])
+    assert isinstance(output, list) and all(isinstance(x, np.ndarray) for x in output)
+
+
+def test_tgi():
+    llm = TGI(config=TGIConfig(dtype="float16", gpus=1))
+    output = llm.generate("Hi, I'm a sanity test")
+    assert isinstance(output, str)
+    output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"])
+    assert isinstance(output, list) and all(isinstance(x, str) for x in output)

From 54f513277b67f7a157c5a0a6897c6f067912f8ac Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 06:56:14 +0100
Subject: [PATCH 2/9] fix tests and simplify examples

---
 example.py                        | 13 ++-----------
 py_txi/__init__.py                |  6 +++---
 py_txi/docker_inference_server.py |  4 +---
 tests/{test.py => test_txi.py}    |  0
 4 files changed, 6 insertions(+), 17 deletions(-)
 rename tests/{test.py => test_txi.py} (100%)

diff --git a/example.py b/example.py
index 42f6557..50a6033 100644
--- a/example.py
+++ b/example.py
@@ -1,21 +1,12 @@
 from py_txi.text_embedding_inference import TEI, TEIConfig
 from py_txi.text_generation_inference import TGI, TGIConfig
-from py_txi.utils import get_free_port
 
-port = get_free_port()
-ports = {"80/tcp": ("127.0.0.1", port)}
-
-tei_config = TEIConfig(pooling="cls", ports=ports)
-embed = TEI(tei_config)
+embed = TEI(config=TEIConfig(pooling="cls"))
 output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"])
 print("Embed:", output)
 embed.close()
 
-port = get_free_port()
-ports = {"80/tcp": ("127.0.0.1", port)}
-
-tgi_config = TGIConfig(ports=ports)
-llm = TGI(tgi_config)
+llm = TGI(config=TGIConfig(sharded=False))
 output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"])
 print("LLM:", output)
 llm.close()
diff --git a/py_txi/__init__.py b/py_txi/__init__.py
index 16c2c4f..1001649 100644
--- a/py_txi/__init__.py
+++ b/py_txi/__init__.py
@@ -1,3 +1,3 @@
-# from .text_embedding_inference import TEI  # noqa
-# from .text_generation_inference import TGI  # noqa
-# from .utils import is_nvidia_system, is_rocm_system  # noqa
+from .text_embedding_inference import TEI  # noqa
+from .text_generation_inference import TGI  # noqa
+from .utils import is_nvidia_system, is_rocm_system, get_free_port  # noqa
diff --git a/py_txi/docker_inference_server.py b/py_txi/docker_inference_server.py
index bd8302e..a233b4e 100644
--- a/py_txi/docker_inference_server.py
+++ b/py_txi/docker_inference_server.py
@@ -150,13 +150,11 @@ def close(self) -> None:
             LOGGER.info("\t+ Stoping Docker container")
             self.container.stop()
             self.container.wait()
-            del self.container
             LOGGER.info("\t+ Docker container stopped")
+            del self.container
 
         if hasattr(self, "client"):
-            LOGGER.info("\t+ Stoping Inference client")
             del self.client
-            LOGGER.info("\t+ Inference client stopped")
 
     def __del__(self) -> None:
         self.close()
diff --git a/tests/test.py b/tests/test_txi.py
similarity index 100%
rename from tests/test.py
rename to tests/test_txi.py

From a1b8e4a7576e83bfad11f86e49f8161643e86dee Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 07:00:41 +0100
Subject: [PATCH 3/9] fix test

---
 tests/test_txi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_txi.py b/tests/test_txi.py
index 573b109..fecc525 100644
--- a/tests/test_txi.py
+++ b/tests/test_txi.py
@@ -5,7 +5,7 @@
 
 
 def test_tei():
-    embed = TEI(config=TEIConfig(pooling="cls", gpus=1))
+    embed = TEI(config=TEIConfig(pooling="cls"))
     output = embed.encode("Hi, I'm a language model")
     assert isinstance(output, np.ndarray)
     output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"])
@@ -13,7 +13,7 @@ def test_tei():
 
 
 def test_tgi():
-    llm = TGI(config=TGIConfig(dtype="float16", gpus=1))
+    llm = TGI(config=TGIConfig(dtype="float16"))
     output = llm.generate("Hi, I'm a sanity test")
     assert isinstance(output, str)
     output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"])

From a45487d71809978c543c6be9d7c8bb75b8b56df0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 07:13:35 +0100
Subject: [PATCH 4/9] added aiohttp

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index d995654..d9966b1 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@
     name="py-tgi",
     version=PY_TGI_VERSION,
     packages=find_packages(),
-    install_requires=["docker", "huggingface-hub", "numpy"],
+    install_requires=["docker", "huggingface-hub", "numpy", "aiohttp"],
     extras_require={"quality": ["ruff"], "testing": ["pytest"]},
     **common_setup_kwargs,
 )

From 31a5e2f86698668d80b98bdab6128d6f050022d4 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 07:28:01 +0100
Subject: [PATCH 5/9] fix

---
 example.py                          | 2 +-
 py_txi/text_generation_inference.py | 4 ++--
 tests/test_txi.py                   | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/example.py b/example.py
index 50a6033..20b6c43 100644
--- a/example.py
+++ b/example.py
@@ -6,7 +6,7 @@
 print("Embed:", output)
 embed.close()
 
-llm = TGI(config=TGIConfig(sharded=False))
+llm = TGI(config=TGIConfig(sharded="false"))
 output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"])
 print("LLM:", output)
 llm.close()
diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py
index 10bad10..1c9a314 100644
--- a/py_txi/text_generation_inference.py
+++ b/py_txi/text_generation_inference.py
@@ -7,7 +7,7 @@
 
 LOGGER = getLogger("TGI")
 
-
+Shareded_Literal = Literal["true", "false"]
 DType_Literal = Literal["float32", "float16", "bfloat16"]
 Quantize_Literal = Literal["bitsandbytes-nf4", "bitsandbytes-fp4", "gptq"]
 
@@ -23,7 +23,7 @@ class TGIConfig(DockerInferenceServerConfig):
     revision: str = "main"
     dtype: Optional[DType_Literal] = None
     quantize: Optional[Quantize_Literal] = None
-    sharded: Optional[bool] = None
+    sharded: Optional[Shareded_Literal] = None
     num_shard: Optional[int] = None
     trust_remote_code: Optional[bool] = None
     disable_custom_kernels: Optional[bool] = None
diff --git a/tests/test_txi.py b/tests/test_txi.py
index fecc525..96efbde 100644
--- a/tests/test_txi.py
+++ b/tests/test_txi.py
@@ -10,6 +10,7 @@ def test_tei():
     assert isinstance(output, np.ndarray)
     output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"])
     assert isinstance(output, list) and all(isinstance(x, np.ndarray) for x in output)
+    embed.close()
 
 
 def test_tgi():
@@ -18,3 +19,4 @@ def test_tgi():
     assert isinstance(output, str)
     output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"])
     assert isinstance(output, list) and all(isinstance(x, str) for x in output)
+    llm.close()

From cb9e31ed617522fe55042b516d68dbb5bda67402 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 07:42:30 +0100
Subject: [PATCH 6/9] fix fr

---
 py_txi/text_generation_inference.py | 2 --
 tests/test_txi.py                   | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py
index 1c9a314..e6d0625 100644
--- a/py_txi/text_generation_inference.py
+++ b/py_txi/text_generation_inference.py
@@ -14,8 +14,6 @@
 
 @dataclass
 class TGIConfig(DockerInferenceServerConfig):
-    # { model_id: "gpt2", revision: Some("main"), validation_workers: 2, sharded: None, num_shard: None, quantize: None, speculate: None, dtype: None, trust_remote_code: false, max_concurrent_requests: 128, max_best_of: 2, max_stop_sequences: 4, max_top_n_tokens: 5, max_input_length: 1024, max_total_tokens: 2048, waiting_served_ratio: 1.2, max_batch_prefill_tokens: 4096, max_batch_total_tokens: None, max_waiting_tokens: 20, max_batch_size: None, enable_cuda_graphs: false, hostname: "6fedb07983ae", port: 80, shard_uds_path: "/tmp/text-generation-server", master_addr: "localhost", master_port: 29500, huggingface_hub_cache: Some("/data"), weights_cache_override: None, disable_custom_kernels: false, cuda_memory_fraction: 1.0, rope_scaling: None, rope_factor: None, json_output: false, otlp_endpoint: None, cors_allow_origin: [], watermark_gamma: None, watermark_delta: None, ngrok: false, ngrok_authtoken: None, ngrok_edge: None, tokenizer_config_path: None, disable_grammar_support: false, env: false }
-
     # Docker options
     image: str = "ghcr.io/huggingface/text-generation-inference:latest"
     # Launcher options
diff --git a/tests/test_txi.py b/tests/test_txi.py
index 96efbde..baa9c36 100644
--- a/tests/test_txi.py
+++ b/tests/test_txi.py
@@ -14,7 +14,7 @@ def test_tei():
 
 
 def test_tgi():
-    llm = TGI(config=TGIConfig(dtype="float16"))
+    llm = TGI(config=TGIConfig(sharded="false"))
     output = llm.generate("Hi, I'm a sanity test")
     assert isinstance(output, str)
     output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"])

From 17d7a4dda6a6eb5440b9a27234ca17bd966e3af6 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 1 Mar 2024 07:49:45 +0100
Subject: [PATCH 7/9] update stuff

---
 .github/workflows/{quality_checks.yaml => quality.yaml} | 8 ++++----
 .github/workflows/{tests.yaml => test.yaml}             | 6 +++---
 Makefile                                                | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)
 rename .github/workflows/{quality_checks.yaml => quality.yaml} (86%)
 rename .github/workflows/{tests.yaml => test.yaml} (88%)

diff --git a/.github/workflows/quality_checks.yaml b/.github/workflows/quality.yaml
similarity index 86%
rename from .github/workflows/quality_checks.yaml
rename to .github/workflows/quality.yaml
index a7bc11e..86fe496 100644
--- a/.github/workflows/quality_checks.yaml
+++ b/.github/workflows/quality.yaml
@@ -1,4 +1,4 @@
-name: quality checks
+name: quality
 
 on:
   push:
@@ -13,10 +13,10 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  run_quality_checks:
+  check_quality:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
+      - name: Checkout code
         uses: actions/checkout@v3
 
       - name: Set up Python 3.10
@@ -29,6 +29,6 @@ jobs:
           pip install --upgrade pip
           pip install -e .[quality]
 
-      - name: Check style
+      - name: Check quality
         run: |
           make quality
diff --git a/.github/workflows/tests.yaml b/.github/workflows/test.yaml
similarity index 88%
rename from .github/workflows/tests.yaml
rename to .github/workflows/test.yaml
index 0f73035..c29376a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/test.yaml
@@ -1,4 +1,4 @@
-name: tests
+name: test
 
 on:
   push:
@@ -24,11 +24,11 @@ jobs:
         with:
           python-version: "3.10"
 
-      - name: Install requirements
+      - name: Install testing requirements
         run: |
           pip install --upgrade pip
           pip install -e .[testing]
 
       - name: Run test
         run: |
-          pytest tests/
+          make test
diff --git a/Makefile b/Makefile
index 9ff0121..493340b 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ style:
 	ruff check --fix .
 
 test:
-	python tests/test.py
+	pytest tests/ -x
 
 install:
 	pip install -e .
\ No newline at end of file

From 75855a73c3e773a7914460902c9499ce6d0a7f91 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <ilyas.moutawwakil@gmail.com>
Date: Tue, 5 Mar 2024 03:15:15 +0000
Subject: [PATCH 8/9] auto rocm

---
 py_txi/text_embedding_inference.py  | 7 +++++++
 py_txi/text_generation_inference.py | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/py_txi/text_embedding_inference.py b/py_txi/text_embedding_inference.py
index 38e19d1..86ba5c0 100644
--- a/py_txi/text_embedding_inference.py
+++ b/py_txi/text_embedding_inference.py
@@ -6,6 +6,7 @@
 import numpy as np
 
 from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig
+from .utils import is_nvidia_system
 
 LOGGER = getLogger("TEI")
 
@@ -28,6 +29,12 @@ class TEIConfig(DockerInferenceServerConfig):
     def __post_init__(self) -> None:
         super().__post_init__()
 
+        if is_nvidia_system() and "cpu" in self.image:
+            LOGGER.warning(
+                "Your system has NVIDIA GPU, but you are using a CPU image. "
+                "Consider using a GPU image for better performance."
+            )
+
 
 class TEI(DockerInferenceServer):
     NAME: str = "Text-Embedding-Inference"
diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py
index e6d0625..f297bd0 100644
--- a/py_txi/text_generation_inference.py
+++ b/py_txi/text_generation_inference.py
@@ -4,6 +4,7 @@
 from typing import Literal, Optional, Union
 
 from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig
+from .utils import is_rocm_system
 
 LOGGER = getLogger("TGI")
 
@@ -54,6 +55,13 @@ class TGIConfig(DockerInferenceServerConfig):
     def __post_init__(self) -> None:
         super().__post_init__()
 
+        if is_rocm_system() and "rocm" not in self.image:
+            LOGGER.warning(
+                "You are running on a ROCm system but the image is not rocm specific. "
+                "Add 'rocm' to the image name to use the rocm specific image."
+            )
+            self.image += "-rocm"
+
 
 class TGI(DockerInferenceServer):
     NAME: str = "Text-Generation-Inference"

From c59156210b53c14dfefda9ae0175d559cffaaab7 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <ilyas.moutawwakil@gmail.com>
Date: Tue, 5 Mar 2024 03:15:24 +0000
Subject: [PATCH 9/9] update readme and setup

---
 README.md | 47 ++++++++++++++++++++---------------------------
 setup.py  |  8 ++++----
 2 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 76553d1..63ff32a 100644
--- a/README.md
+++ b/README.md
@@ -1,51 +1,44 @@
-# Py-TGI (Py-TXI at this point xD)
+# Py-TXI (previously Py-TGI)
 
-[![PyPI version](https://badge.fury.io/py/py-tgi.svg)](https://badge.fury.io/py/py-tgi)
-[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/py-tgi)](https://pypi.org/project/py-tgi/)
-[![PyPI - Format](https://img.shields.io/pypi/format/py-tgi)](https://pypi.org/project/py-tgi/)
-[![Downloads](https://pepy.tech/badge/py-tgi)](https://pepy.tech/project/py-tgi)
-[![PyPI - License](https://img.shields.io/pypi/l/py-tgi)](https://pypi.org/project/py-tgi/)
-[![Tests](https://github.com/IlyasMoutawwakil/py-tgi/actions/workflows/tests.yaml/badge.svg)](https://github.com/IlyasMoutawwakil/py-tgi/actions/workflows/tests.yaml)
+[![PyPI version](https://badge.fury.io/py/py-txi.svg)](https://badge.fury.io/py/py-txi)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/py-txi)](https://pypi.org/project/py-txi/)
+[![PyPI - Format](https://img.shields.io/pypi/format/py-txi)](https://pypi.org/project/py-txi/)
+[![Downloads](https://pepy.tech/badge/py-txi)](https://pepy.tech/project/py-txi)
+[![PyPI - License](https://img.shields.io/pypi/l/py-txi)](https://pypi.org/project/py-txi/)
+[![Tests](https://github.com/IlyasMoutawwakil/py-txi/actions/workflows/test.yaml/badge.svg)](https://github.com/IlyasMoutawwakil/py-txi/actions/workflows/test.yaml)
 
-Py-TGI is a Python wrapper around [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference) and [Text-Embedding-Inference](https://github.com/huggingface/text-embeddings-inference) that enables creating and running TGI/TEI instances through the awesome `docker-py` in a similar style to Transformers API.
+Py-TXI is a Python wrapper around [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference) and [Text-Embedding-Inference](https://github.com/huggingface/text-embeddings-inference) that enables creating and running TGI/TEI instances through the awesome `docker-py` in a similar style to Transformers API.
 
 ## Installation
 
 ```bash
-pip install py-tgi
+pip install py-txi
 ```
 
-Py-TGI is designed to be used in a similar way to Transformers API. We use `docker-py` (instead of a dirty `subprocess` solution) so that the containers you run are linked to the main process and are stopped automatically when your code finishes or fails.
+Py-TXI is designed to be used in a similar way to Transformers API. We use `docker-py` (instead of a dirty `subprocess` solution) so that the containers you run are linked to the main process and are stopped automatically when your code finishes or fails.
 
 ## Usage
 
 Here's an example of how to use it:
 
 ```python
-from py_tgi import TGI, is_nvidia_system, is_rocm_system
+from py_txi.text_generation_inference import TGI, TGIConfig
 
-llm = TGI(
-    model="NousResearch/Llama-2-7b-hf",
-    devices=["/dev/kfd", "/dev/dri"] if is_rocm_system() else None,
-    gpus="all" if is_nvidia_system() else None,
-)
+llm = TGI(config=TGIConfig(sharded="false"))
 output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"])
-print(output)
+print("LLM:", output)
+llm.close()
 ```
 
-Output: ```[" and I'm here to help you with any questions you have. What can I help you with", "\nUser 0: I'm doing well, thanks for asking. I'm just a"]```
+Output: ```LLM: ["er. I'm a language modeler. I'm a language modeler. I'm a language", " I'm fine, how are you? I'm fine, how are you? I'm fine,"]```
 
 ```python
-from py_tgi import TEI, is_nvidia_system
-
-embed = TEI(
-    model="BAAI/bge-large-en-v1.5",
-    dtype="float16",
-    pooling="mean",
-    gpus="all" if is_nvidia_system() else None,
-)
+from py_txi.text_embedding_inference import TEI, TEIConfig
+
+embed = TEI(config=TEIConfig(pooling="cls"))
 output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"])
-print(output)
+print("Embed:", output)
+embed.close()
 ```
 
 Output: ```[array([[ 0.01058742, -0.01588806, -0.03487622, ..., -0.01613717,
diff --git a/setup.py b/setup.py
index d9966b1..f283131 100644
--- a/setup.py
+++ b/setup.py
@@ -2,14 +2,14 @@
 
 from setuptools import find_packages, setup
 
-PY_TGI_VERSION = "0.2.0"
+PY_TXI_VERSION = "0.4.0"
 
 common_setup_kwargs = {
     "author": "Ilyas Moutawwakil",
     "author_email": "ilyas.moutawwakil@gmail.com",
     "description": "A Python wrapper around TGI and TEI servers",
     "keywords": ["tgi", "llm", "tei", "embedding", "huggingface", "docker", "python"],
-    "url": "https://github.com/IlyasMoutawwakil/py-tgi",
+    "url": "https://github.com/IlyasMoutawwakil/py-txi",
     "long_description_content_type": "text/markdown",
     "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
     "platforms": ["linux", "windows", "macos"],
@@ -21,8 +21,8 @@
 
 
 setup(
-    name="py-tgi",
-    version=PY_TGI_VERSION,
+    name="py-txi",
+    version=PY_TXI_VERSION,
     packages=find_packages(),
     install_requires=["docker", "huggingface-hub", "numpy", "aiohttp"],
     extras_require={"quality": ["ruff"], "testing": ["pytest"]},