From 196f07aa874eaf51bbc0ac531f2930da8c6577fa Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 06:42:26 +0100 Subject: [PATCH 1/9] refacored + syncio --- .github/workflows/quality_checks.yaml | 34 ++++++ .github/workflows/release.yaml | 2 +- .github/workflows/tests.yaml | 5 +- example.py | 31 ++--- py_tgi/__init__.py | 3 - py_tgi/inference_server.py | 143 ----------------------- py_tgi/text_embedding_inference.py | 144 ----------------------- py_tgi/text_generation_inference.py | 144 ----------------------- py_txi/__init__.py | 3 + py_txi/docker_inference_server.py | 162 ++++++++++++++++++++++++++ py_txi/text_embedding_inference.py | 56 +++++++++ py_txi/text_generation_inference.py | 84 +++++++++++++ {py_tgi => py_txi}/utils.py | 9 +- setup.py | 2 +- tests/test.py | 31 ++--- 15 files changed, 384 insertions(+), 469 deletions(-) create mode 100644 .github/workflows/quality_checks.yaml delete mode 100644 py_tgi/__init__.py delete mode 100644 py_tgi/inference_server.py delete mode 100644 py_tgi/text_embedding_inference.py delete mode 100644 py_tgi/text_generation_inference.py create mode 100644 py_txi/__init__.py create mode 100644 py_txi/docker_inference_server.py create mode 100644 py_txi/text_embedding_inference.py create mode 100644 py_txi/text_generation_inference.py rename {py_tgi => py_txi}/utils.py (67%) diff --git a/.github/workflows/quality_checks.yaml b/.github/workflows/quality_checks.yaml new file mode 100644 index 0000000..a7bc11e --- /dev/null +++ b/.github/workflows/quality_checks.yaml @@ -0,0 +1,34 @@ +name: quality checks + +on: + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + run_quality_checks: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - 
name: Install quality requirements + run: | + pip install --upgrade pip + pip install -e .[quality] + + - name: Check style + run: | + make quality diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 25b596f..6b86b5e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -5,7 +5,7 @@ on: types: [created] jobs: - deploy: + release: runs-on: ubuntu-latest steps: - name: Checkout code diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index f6b283b..0f73035 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -27,7 +27,8 @@ jobs: - name: Install requirements run: | pip install --upgrade pip - pip install -e . + pip install -e .[testing] - name: Run test - run: python tests/test.py + run: | + pytest tests/ diff --git a/example.py b/example.py index 1c63620..42f6557 100644 --- a/example.py +++ b/example.py @@ -1,20 +1,21 @@ -from py_tgi import TEI, TGI, is_nvidia_system, is_rocm_system +from py_txi.text_embedding_inference import TEI, TEIConfig +from py_txi.text_generation_inference import TGI, TGIConfig +from py_txi.utils import get_free_port -if is_nvidia_system(): - llm = TGI(model="NousResearch/Llama-2-7b-hf", gpus="all", port=1234) -elif is_rocm_system(): - llm = TGI(model="NousResearch/Llama-2-7b-hf", devices=["/dev/kfd", "/dev/dri"], port=1234) -else: - llm = TGI(model="NousResearch/Llama-2-7b-hf", port=1234) +port = get_free_port() +ports = {"80/tcp": ("127.0.0.1", port)} +tei_config = TEIConfig(pooling="cls", ports=ports) +embed = TEI(tei_config) +output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"]) +print("Embed:", output) +embed.close() +port = get_free_port() +ports = {"80/tcp": ("127.0.0.1", port)} + +tgi_config = TGIConfig(ports=ports) +llm = TGI(tgi_config) output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"]) print("LLM:", output) - -if is_nvidia_system(): - embed = 
TEI(model="BAAI/bge-large-en-v1.5", dtype="float16", pooling="mean", gpus="all", port=4321) -else: - embed = TEI(model="BAAI/bge-large-en-v1.5", dtype="float16", pooling="mean", port=4321) - -output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"]) -print("Embed:", output) +llm.close() diff --git a/py_tgi/__init__.py b/py_tgi/__init__.py deleted file mode 100644 index 870eedd..0000000 --- a/py_tgi/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .text_embedding_inference import TEI # noqa -from .text_generation_inference import TGI # noqa -from .utils import is_nvidia_system, is_rocm_system # noqa diff --git a/py_tgi/inference_server.py b/py_tgi/inference_server.py deleted file mode 100644 index 95eb817..0000000 --- a/py_tgi/inference_server.py +++ /dev/null @@ -1,143 +0,0 @@ -import os -import re -from abc import ABC -from logging import INFO, basicConfig, getLogger -from typing import Any, Dict, List, Optional, Union - -import docker -import docker.errors -import docker.types - -from .utils import HF_CACHE_DIR - -basicConfig(level=INFO) - - -DOCKER = docker.from_env() -LOGGER = getLogger("inference-server") - - -class InferenceServer(ABC): - NAME: str = "Inference-Server" - - def __init__( - self, - # model options - model: str, - revision: str, - # image options - image: str, - # docker options - port: int = 1111, - shm_size: str = "1g", - address: str = "127.0.0.1", - volumes: Dict[str, Any] = {HF_CACHE_DIR: "/data"}, # connects local hf cache to /data folder - gpus: Optional[Union[str, int]] = None, # e.g. "all" or "0,1,2,3" or 4 for NVIDIA - devices: Optional[List[str]] = None, # e.g. 
["/dev/kfd", "/dev/dri"] for ROCm - # launcher options - **kwargs, - ) -> None: - # model options - self.model = model - self.revision = revision - # docker options - self.port = port - self.image = image - self.volumes = volumes - self.address = address - self.shm_size = shm_size - # device options - self.gpus = gpus - self.devices = devices - - try: - LOGGER.info(f"\t+ Checking if {self.NAME} image is available locally") - DOCKER.images.get(self.image) - LOGGER.info(f"\t+ {self.NAME} image found locally") - except docker.errors.ImageNotFound: - LOGGER.info(f"\t+ {self.NAME} image not found locally, pulling from Docker Hub") - DOCKER.images.pull(self.image) - - LOGGER.info(f"\t+ Building {self.NAME} URL") - self.build_url() - - LOGGER.info(f"\t+ Building {self.NAME} environment") - self.build_env() - - LOGGER.info(f"\t+ Building {self.NAME} devices") - self.build_devices() - - LOGGER.info(f"\t+ Building {self.NAME} command") - self.build_command() - - LOGGER.info(f"\t+ Running {self.NAME} server") - self.run_container() - - LOGGER.info(f"\t+ Waiting for {self.NAME} server to be ready") - self.wait() - - LOGGER.info(f"\t+ Connecting to {self.NAME} server") - self.connect_client() - - def run_container(self): - self.container = DOCKER.containers.run( - image=self.image, - command=self.command, - shm_size=self.shm_size, - ports={"80/tcp": (self.address, self.port)}, - volumes={source: {"bind": target, "mode": "rw"} for source, target in self.volumes.items()}, - device_requests=self.device_requests, - devices=self.devices, - environment=self.env, - auto_remove=True, - detach=True, - ) - - def wait(self): - raise NotImplementedError - - def connect_client(self): - raise NotImplementedError - - def build_devices(self): - if self.gpus is not None and isinstance(self.gpus, str) and self.gpus == "all": - LOGGER.info("\t+ Using all GPU(s)") - self.device_requests = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] - elif self.gpus is not None and 
isinstance(self.gpus, int): - LOGGER.info(f"\t+ Using {self.gpus} GPU(s)") - self.device_requests = [docker.types.DeviceRequest(count=self.gpus, capabilities=[["gpu"]])] - elif self.gpus is not None and isinstance(self.gpus, str) and re.match(r"^\d+(,\d+)*$", self.gpus): - LOGGER.info(f"\t+ Using GPU(s) {self.gpus}") - self.device_requests = [docker.types.DeviceRequest(device_ids=[self.gpus], capabilities=[["gpu"]])] - else: - LOGGER.info("\t+ Not using any GPU(s)") - self.device_requests = None - - if self.devices is not None and isinstance(self.devices, list) and all(os.path.exists(d) for d in self.devices): - LOGGER.info(f"\t+ Using custom device(s) {self.devices}") - self.devices = self.devices - else: - LOGGER.info("\t+ Not using any custom device(s)") - self.devices = None - - def build_url(self): - self.url = f"http://{self.address}:{self.port}" - - def build_env(self): - self.env = {} - - def build_command(self): - self.command = [] - - def close(self) -> None: - if hasattr(self, "container"): - LOGGER.info("\t+ Stoping Docker container") - self.container.stop() - self.container.wait() - LOGGER.info("\t+ Docker container stopped") - - def __del__(self): - try: - self.close() - except Exception: - pass diff --git a/py_tgi/text_embedding_inference.py b/py_tgi/text_embedding_inference.py deleted file mode 100644 index 42f2c41..0000000 --- a/py_tgi/text_embedding_inference.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -import time -from concurrent.futures import ThreadPoolExecutor -from logging import getLogger -from typing import Any, Dict, List, Literal, Optional, Union - -import numpy as np -from huggingface_hub import InferenceClient - -from .inference_server import InferenceServer -from .utils import CONNECTION_TIMEOUT, HF_CACHE_DIR - -LOGGER = getLogger("TEI") - - -Pooling_Literal = Literal["cls", "mean"] -DType_Literal = Literal["float32", "float16"] - - -class TEI(InferenceServer): - NAME: str = "Text-Embedding-Inference" - - def __init__( - self, - # 
model options - model: str, - revision: str = "main", - # image options - image: str = "ghcr.io/huggingface/text-embeddings-inference:latest", - # docker options - port: int = 1111, - shm_size: str = "1g", - address: str = "127.0.0.1", - volumes: Dict[str, Any] = {HF_CACHE_DIR: "/data"}, # connects local hf cache to /data folder - devices: Optional[List[str]] = None, # e.g. ["/dev/kfd", "/dev/dri"] for ROCm - gpus: Optional[Union[str, int]] = None, # e.g. "all" or "0,1,2,3" or 4 for NVIDIA - # launcher options - # tgi launcher options - dtype: Optional[DType_Literal] = None, - pooling: Optional[Pooling_Literal] = None, - tokenization_workers: Optional[int] = None, - max_concurrent_requests: Optional[int] = None, - max_batch_tokens: Optional[int] = None, - max_batch_requests: Optional[int] = None, - max_client_batch_size: Optional[int] = None, - ) -> None: - # tgi launcher options - self.dtype = dtype - self.pooling = pooling - self.tokenization_workers = tokenization_workers - self.max_concurrent_requests = max_concurrent_requests - self.max_batch_tokens = max_batch_tokens - self.max_batch_requests = max_batch_requests - self.max_client_batch_size = max_client_batch_size - - if gpus is None and "cpu-" not in image: - LOGGER.warning("No GPUs were specified, but the image does not contain 'cpu-'. 
Adding it.") - image_, tag_ = image.split(":") - image = f"{image_}:cpu-{tag_}" - - super().__init__( - model=model, - revision=revision, - image=image, - port=port, - shm_size=shm_size, - address=address, - volumes=volumes, - devices=devices, - gpus=gpus, - ) - - def wait(self): - for line in self.container.logs(stream=True): - log = line.decode("utf-8").strip() - if "Ready" in log: - LOGGER.info(f"\t {log}") - break - elif "Error" in log: - LOGGER.info(f"\t {log}") - raise Exception(f"{self.NAME} server failed to start") - else: - LOGGER.info(f"\t {log}") - - def connect_client(self): - start_time = time.time() - while time.time() - start_time < CONNECTION_TIMEOUT: - try: - self.client = InferenceClient(model=self.url) - self.client.feature_extraction("Hello world!") - LOGGER.info(f"\t+ Connected to {self.NAME} server successfully") - return - except Exception: - LOGGER.info(f"\t+ {self.NAME} server is not ready yet, waiting 1 second") - time.sleep(1) - - raise Exception(f"{self.NAME} server took too long to start (60 seconds)") - - def build_command(self): - self.command = ["--model-id", self.model, "--revision", self.revision] - if self.dtype: - self.command.extend(["--dtype", self.dtype]) - if self.pooling: - self.command.extend(["--pooling", self.pooling]) - if self.tokenization_workers: - self.command.extend(["--tokenization-workers", str(self.tokenization_workers)]) - if self.max_concurrent_requests: - self.command.extend(["--max-concurrent-requests", str(self.max_concurrent_requests)]) - if self.max_batch_tokens: - self.command.extend(["--max-batch-tokens", str(self.max_batch_tokens)]) - if self.max_batch_requests: - self.command.extend(["--max-batch-requests", str(self.max_batch_requests)]) - if self.max_client_batch_size: - self.command.extend(["--max-client-batch-size", str(self.max_client_batch_size)]) - - def build_env(self): - self.env = {} - if os.environ.get("HUGGING_FACE_HUB_TOKEN", None) is not None: - self.env["HUGGING_FACE_HUB_TOKEN"] = 
os.environ["HUGGING_FACE_HUB_TOKEN"] - - @classmethod - def from_pretrained(cls, *args, **kwargs): - return cls(*args, **kwargs) - - def encode(self, text: Union[str, List[str]], **kwargs) -> Union[np.ndarray, List[np.ndarray]]: - if isinstance(text, str): - output = self.client.feature_extraction(text=text, **kwargs) - return output - - elif isinstance(text, list): - outputs = [] - - with ThreadPoolExecutor(max_workers=len(text)) as executor: - futures = [ - executor.submit(self.client.feature_extraction, text=text[i], **kwargs) for i in range(len(text)) - ] - - for i in range(len(text)): - outputs.append(futures[i].result()) - - return outputs - - def __call__(self, text: Union[str, List[str]], **kwargs) -> Union[np.ndarray, List[np.ndarray]]: - return self.encode(text, **kwargs) diff --git a/py_tgi/text_generation_inference.py b/py_tgi/text_generation_inference.py deleted file mode 100644 index 0e3cf94..0000000 --- a/py_tgi/text_generation_inference.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -import time -from concurrent.futures import ThreadPoolExecutor -from logging import getLogger -from typing import Any, Dict, List, Literal, Optional, Union - -from huggingface_hub import InferenceClient -from huggingface_hub.inference._text_generation import TextGenerationResponse - -from .inference_server import InferenceServer -from .utils import CONNECTION_TIMEOUT, HF_CACHE_DIR, is_rocm_system - -LOGGER = getLogger("TGI") - - -DType_Literal = Literal["float32", "float16", "bfloat16"] -Quantize_Literal = Literal["bitsandbytes-nf4", "bitsandbytes-fp4", "gptq"] - - -class TGI(InferenceServer): - NAME: str = "Text-Generation-Inference" - - def __init__( - self, - # model options - model: str, - revision: str = "main", - # image options - image: str = "ghcr.io/huggingface/text-generation-inference:latest", - # docker options - port: int = 1111, - shm_size: str = "1g", - address: str = "127.0.0.1", - volumes: Dict[str, Any] = {HF_CACHE_DIR: "/data"}, # connects local hf 
cache to /data folder - devices: Optional[List[str]] = None, # e.g. ["/dev/kfd", "/dev/dri"] for ROCm - gpus: Optional[Union[str, int]] = None, # e.g. "all" or "0,1,2,3" or 4 for NVIDIA - # launcher options - # tgi launcher options - sharded: Optional[bool] = None, - num_shard: Optional[int] = None, - dtype: Optional[DType_Literal] = None, - quantize: Optional[Quantize_Literal] = None, - trust_remote_code: Optional[bool] = False, - disable_custom_kernels: Optional[bool] = False, - ) -> None: - # tgi launcher options - self.dtype = dtype - self.sharded = sharded - self.quantize = quantize - self.num_shard = num_shard - self.trust_remote_code = trust_remote_code - self.disable_custom_kernels = disable_custom_kernels - - if devices and is_rocm_system() and "-rocm" not in image: - LOGGER.warning("ROCm system detected, but the image does not contain '-rocm'. Adding it.") - image = image + "-rocm" - - super().__init__( - model=model, - revision=revision, - image=image, - port=port, - shm_size=shm_size, - address=address, - volumes=volumes, - devices=devices, - gpus=gpus, - ) - - def wait(self): - for line in self.container.logs(stream=True): - log = line.decode("utf-8").strip() - if "Connected" in log: - LOGGER.info(f"\t {log}") - break - elif "Error" in log: - LOGGER.info(f"\t {log}") - raise Exception(f"{self.NAME} server failed to start") - else: - LOGGER.info(f"\t {log}") - - def connect_client(self): - start_time = time.time() - while time.time() - start_time < CONNECTION_TIMEOUT: - try: - self.client = InferenceClient(model=self.url) - self.client.text_generation("Hello world!") - LOGGER.info(f"\t+ Connected to {self.NAME} server successfully") - return - except Exception: - LOGGER.info(f"\t+ {self.NAME} server is not ready yet, waiting 1 second") - time.sleep(1) - - raise Exception(f"{self.NAME} server took too long to start (60 seconds)") - - def build_command(self): - self.command = ["--model-id", self.model, "--revision", self.revision] - if self.sharded is not 
None: - self.command.extend(["--sharded", str(self.sharded).lower()]) - if self.num_shard is not None: - self.command.extend(["--num-shard", str(self.num_shard)]) - if self.quantize is not None: - self.command.extend(["--quantize", self.quantize]) - if self.dtype is not None: - self.command.extend(["--dtype", self.dtype]) - - if self.trust_remote_code: - self.command.append("--trust-remote-code") - if self.disable_custom_kernels: - self.command.append("--disable-custom-kernels") - - def build_env(self): - self.env = {} - if os.environ.get("HUGGING_FACE_HUB_TOKEN", None) is not None: - self.env["HUGGING_FACE_HUB_TOKEN"] = os.environ["HUGGING_FACE_HUB_TOKEN"] - - @classmethod - def from_pretrained(cls, *args, **kwargs): - return cls(*args, **kwargs) - - def generate( - self, prompt: Union[str, List[str]], **kwargs - ) -> Union[TextGenerationResponse, List[TextGenerationResponse]]: - if isinstance(prompt, str): - output = self.client.text_generation(prompt=prompt, **kwargs) - return output - - elif isinstance(prompt, list): - outputs = [] - - with ThreadPoolExecutor(max_workers=len(prompt)) as executor: - futures = [ - executor.submit(self.client.text_generation, prompt=prompt[i], **kwargs) for i in range(len(prompt)) - ] - - for i in range(len(prompt)): - outputs.append(futures[i].result()) - - return outputs - - def __call__( - self, prompt: Union[str, List[str]], **kwargs - ) -> Union[TextGenerationResponse, List[TextGenerationResponse]]: - return self.generate(prompt, **kwargs) diff --git a/py_txi/__init__.py b/py_txi/__init__.py new file mode 100644 index 0000000..16c2c4f --- /dev/null +++ b/py_txi/__init__.py @@ -0,0 +1,3 @@ +# from .text_embedding_inference import TEI # noqa +# from .text_generation_inference import TGI # noqa +# from .utils import is_nvidia_system, is_rocm_system # noqa diff --git a/py_txi/docker_inference_server.py b/py_txi/docker_inference_server.py new file mode 100644 index 0000000..bd8302e --- /dev/null +++ 
b/py_txi/docker_inference_server.py @@ -0,0 +1,162 @@ +import asyncio +import os +import re +import time +from abc import ABC +from dataclasses import asdict, dataclass, field +from logging import INFO, basicConfig, getLogger +from typing import Any, Dict, List, Optional, Union + +import docker +import docker.errors +import docker.types +from huggingface_hub import AsyncInferenceClient + +from .utils import get_free_port + +basicConfig(level=INFO) + +DOCKER = docker.from_env() +LOGGER = getLogger("docker-inference-server") + + +@dataclass +class DockerInferenceServerConfig: + # Image to use for the container + image: str + # Shared memory size for the container + shm_size: str = "1g" + # List of custom devices to forward to the container e.g. ["/dev/kfd", "/dev/dri"] for ROCm + devices: Optional[List[str]] = None + # NVIDIA-docker GPU device options e.g. "all" (all) or "0,1,2,3" (ids) or 4 (count) + gpus: Optional[Union[str, int]] = None + + ports: Dict[str, Any] = field( + default_factory=lambda: {"80/tcp": ("127.0.0.1", 0)}, + metadata={"help": "Dictionary of ports to expose from the container."}, + ) + volumes: Dict[str, Any] = field( + default_factory=lambda: {os.path.expanduser("~/.cache/huggingface/hub"): {"bind": "/data", "mode": "rw"}}, + metadata={"help": "Dictionary of volumes to mount inside the container."}, + ) + environment: Dict[str, str] = field( + default_factory=lambda: {"HUGGINGFACE_HUB_TOKEN": os.environ.get("HUGGINGFACE_HUB_TOKEN", "")}, + metadata={"help": "Dictionary of environment variables to forward to the container."}, + ) + + timeout: int = 60 + + def __post_init__(self) -> None: + if self.ports["80/tcp"][1] == 0: + LOGGER.info("\t+ Getting a free port for the server") + self.ports["80/tcp"] = (self.ports["80/tcp"][0], get_free_port()) + + +class DockerInferenceServer(ABC): + NAME: str = "Docker-Inference-Server" + SUCCESS_SENTINEL: str = "Success" + FAILURE_SENTINEL: str = "Failure" + + def __init__(self, config: 
DockerInferenceServerConfig) -> None: + self.config = config + + try: + LOGGER.info(f"\t+ Checking if {self.NAME} image is available locally") + DOCKER.images.get(self.config.image) + LOGGER.info(f"\t+ {self.NAME} image found locally") + except docker.errors.ImageNotFound: + LOGGER.info(f"\t+ {self.NAME} image not found locally, pulling from Docker Hub") + DOCKER.images.pull(self.config.image) + + if self.config.gpus is not None and isinstance(self.config.gpus, str) and self.config.gpus == "all": + LOGGER.info("\t+ Using all GPU(s)") + self.device_requests = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] + elif self.config.gpus is not None and isinstance(self.config.gpus, int): + LOGGER.info(f"\t+ Using {self.config.gpus} GPU(s)") + self.device_requests = [docker.types.DeviceRequest(count=self.config.gpus, capabilities=[["gpu"]])] + elif ( + self.config.gpus is not None + and isinstance(self.config.gpus, str) + and re.match(r"^\d+(,\d+)*$", self.config.gpus) + ): + LOGGER.info(f"\t+ Using GPU(s) {self.config.gpus}") + self.device_requests = [docker.types.DeviceRequest(device_ids=[self.config.gpus], capabilities=[["gpu"]])] + else: + LOGGER.info("\t+ Not using any GPU(s)") + self.device_requests = None + + LOGGER.info(f"\t+ Building {self.NAME} command") + self.command = [] + for k, v in asdict(self.config).items(): + if k in DockerInferenceServerConfig.__annotations__: + continue + elif v is not None: + if isinstance(v, bool): + self.command.append(f"--{k.replace('_', '-')}") + else: + self.command.append(f"--{k.replace('_', '-')}={v}") + + address, port = self.config.ports["80/tcp"] + self.url = f"http://{address}:{port}" + + LOGGER.info(f"\t+ Running {self.NAME} container") + self.container = DOCKER.containers.run( + image=self.config.image, + ports=self.config.ports, + volumes=self.config.volumes, + devices=self.config.devices, + shm_size=self.config.shm_size, + environment=self.config.environment, + device_requests=self.device_requests, + 
command=self.command, + auto_remove=True, + detach=True, + ) + + LOGGER.info(f"\t+ Streaming {self.NAME} server logs") + for line in self.container.logs(stream=True): + log = line.decode("utf-8").strip() + if self.SUCCESS_SENTINEL.lower() in log.lower(): + LOGGER.info(f"\t {log}") + break + elif self.FAILURE_SENTINEL.lower() in log.lower(): + LOGGER.info(f"\t {log}") + raise Exception(f"{self.NAME} server failed to start") + else: + LOGGER.info(f"\t {log}") + + LOGGER.info(f"\t+ Waiting for {self.NAME} server to be ready") + start_time = time.time() + while time.time() - start_time < self.config.timeout: + try: + if not hasattr(self, "client"): + self.client = AsyncInferenceClient(model=self.url) + + asyncio.run(self.single_client_call(f"Hello {self.NAME}!")) + LOGGER.info(f"\t+ Connected to {self.NAME} server successfully") + break + except Exception: + LOGGER.info(f"\t+ {self.NAME} server is not ready yet, waiting 1 second") + time.sleep(1) + + async def single_client_call(self, *args, **kwargs) -> Any: + raise NotImplementedError + + async def batch_client_call(self, *args, **kwargs) -> Any: + raise NotImplementedError + + def close(self) -> None: + if hasattr(self, "container"): + LOGGER.info("\t+ Stoping Docker container") + self.container.stop() + self.container.wait() + del self.container + LOGGER.info("\t+ Docker container stopped") + + if hasattr(self, "client"): + LOGGER.info("\t+ Stoping Inference client") + del self.client + LOGGER.info("\t+ Inference client stopped") + + def __del__(self) -> None: + self.close() diff --git a/py_txi/text_embedding_inference.py b/py_txi/text_embedding_inference.py new file mode 100644 index 0000000..38e19d1 --- /dev/null +++ b/py_txi/text_embedding_inference.py @@ -0,0 +1,56 @@ +import asyncio +from dataclasses import dataclass +from logging import getLogger +from typing import List, Literal, Optional, Union + +import numpy as np + +from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig + 
+LOGGER = getLogger("TEI") + + +Pooling_Literal = Literal["cls", "mean"] +DType_Literal = Literal["float32", "float16"] + + +@dataclass(order=False) +class TEIConfig(DockerInferenceServerConfig): + # Docker options + image: str = "ghcr.io/huggingface/text-embeddings-inference:cpu-latest" + # Launcher options + model_id: str = "bert-base-uncased" + revision: str = "main" + dtype: Optional[DType_Literal] = None + pooling: Optional[Pooling_Literal] = None + tokenization_workers: Optional[int] = None + + def __post_init__(self) -> None: + super().__post_init__() + + +class TEI(DockerInferenceServer): + NAME: str = "Text-Embedding-Inference" + SUCCESS_SENTINEL: str = "Ready" + FAILURE_SENTINEL: str = "Error" + + def __init__(self, config: TEIConfig) -> None: + super().__init__(config) + + async def single_client_call(self, text: str, **kwargs) -> np.ndarray: + output = await self.client.feature_extraction(text=text, **kwargs) + return output + + async def batch_client_call(self, text: List[str], **kwargs) -> List[np.ndarray]: + output = await asyncio.gather(*[self.single_client_call(t, **kwargs) for t in text]) + return output + + def encode(self, text: Union[str, List[str]], **kwargs) -> Union[np.ndarray, List[np.ndarray]]: + if isinstance(text, str): + output = asyncio.run(self.single_client_call(text, **kwargs)) + return output + elif isinstance(text, list): + output = asyncio.run(self.batch_client_call(text, **kwargs)) + return output + else: + raise ValueError(f"Unsupported input type: {type(text)}") diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py new file mode 100644 index 0000000..10bad10 --- /dev/null +++ b/py_txi/text_generation_inference.py @@ -0,0 +1,84 @@ +import asyncio +from dataclasses import dataclass +from logging import getLogger +from typing import Literal, Optional, Union + +from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig + +LOGGER = getLogger("TGI") + + +DType_Literal = 
Literal["float32", "float16", "bfloat16"] +Quantize_Literal = Literal["bitsandbytes-nf4", "bitsandbytes-fp4", "gptq"] + + +@dataclass +class TGIConfig(DockerInferenceServerConfig): + # { model_id: "gpt2", revision: Some("main"), validation_workers: 2, sharded: None, num_shard: None, quantize: None, speculate: None, dtype: None, trust_remote_code: false, max_concurrent_requests: 128, max_best_of: 2, max_stop_sequences: 4, max_top_n_tokens: 5, max_input_length: 1024, max_total_tokens: 2048, waiting_served_ratio: 1.2, max_batch_prefill_tokens: 4096, max_batch_total_tokens: None, max_waiting_tokens: 20, max_batch_size: None, enable_cuda_graphs: false, hostname: "6fedb07983ae", port: 80, shard_uds_path: "/tmp/text-generation-server", master_addr: "localhost", master_port: 29500, huggingface_hub_cache: Some("/data"), weights_cache_override: None, disable_custom_kernels: false, cuda_memory_fraction: 1.0, rope_scaling: None, rope_factor: None, json_output: false, otlp_endpoint: None, cors_allow_origin: [], watermark_gamma: None, watermark_delta: None, ngrok: false, ngrok_authtoken: None, ngrok_edge: None, tokenizer_config_path: None, disable_grammar_support: false, env: false } + + # Docker options + image: str = "ghcr.io/huggingface/text-generation-inference:latest" + # Launcher options + model_id: str = "gpt2" + revision: str = "main" + dtype: Optional[DType_Literal] = None + quantize: Optional[Quantize_Literal] = None + sharded: Optional[bool] = None + num_shard: Optional[int] = None + trust_remote_code: Optional[bool] = None + disable_custom_kernels: Optional[bool] = None + # Inference options + max_best_of: Optional[int] = None + max_concurrent_requests: Optional[int] = None + max_stop_sequences: Optional[int] = None + max_top_n_tokens: Optional[int] = None + max_input_length: Optional[int] = None + max_total_tokens: Optional[int] = None + waiting_served_ratio: Optional[float] = None + max_batch_prefill_tokens: Optional[int] = None + max_batch_total_tokens: 
Optional[int] = None + max_waiting_tokens: Optional[int] = None + max_batch_size: Optional[int] = None + enable_cuda_graphs: Optional[bool] = None + huggingface_hub_cache: Optional[str] = None + weights_cache_override: Optional[str] = None + cuda_memory_fraction: Optional[float] = None + rope_scaling: Optional[str] = None + rope_factor: Optional[str] = None + json_output: Optional[bool] = None + otlp_endpoint: Optional[str] = None + cors_allow_origin: Optional[list] = None + watermark_gamma: Optional[str] = None + watermark_delta: Optional[str] = None + tokenizer_config_path: Optional[str] = None + disable_grammar_support: Optional[bool] = None + + def __post_init__(self) -> None: + super().__post_init__() + + +class TGI(DockerInferenceServer): + NAME: str = "Text-Generation-Inference" + SUCCESS_SENTINEL: str = "Connected" + FAILURE_SENTINEL: str = "Error" + + def __init__(self, config: TGIConfig) -> None: + super().__init__(config) + + async def single_client_call(self, prompt: str, **kwargs) -> str: + output = await self.client.text_generation(prompt=prompt, **kwargs) + return output + + async def batch_client_call(self, prompt: list, **kwargs) -> list: + output = await asyncio.gather(*[self.single_client_call(prompt=p, **kwargs) for p in prompt]) + return output + + def generate(self, prompt: Union[str, list], **kwargs) -> Union[str, list]: + if isinstance(prompt, str): + output = asyncio.run(self.single_client_call(prompt, **kwargs)) + return output + elif isinstance(prompt, list): + output = asyncio.run(self.batch_client_call(prompt, **kwargs)) + return output + else: + raise ValueError(f"Unsupported input type: {type(prompt)}") diff --git a/py_tgi/utils.py b/py_txi/utils.py similarity index 67% rename from py_tgi/utils.py rename to py_txi/utils.py index eeacc70..473d042 100644 --- a/py_tgi/utils.py +++ b/py_txi/utils.py @@ -1,8 +1,11 @@ -import os +import socket import subprocess -HF_CACHE_DIR = os.path.expanduser("~/.cache/huggingface/hub") 
-CONNECTION_TIMEOUT = 60 + +def get_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def is_rocm_system() -> bool: diff --git a/setup.py b/setup.py index 11b3320..d995654 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,6 @@ version=PY_TGI_VERSION, packages=find_packages(), install_requires=["docker", "huggingface-hub", "numpy"], - extras_require={"quality": ["ruff"]}, + extras_require={"quality": ["ruff"], "testing": ["pytest"]}, **common_setup_kwargs, ) diff --git a/tests/test.py b/tests/test.py index d55f119..573b109 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,15 +1,20 @@ import numpy as np -from py_tgi import TEI, TGI - -embed = TEI(model="bert-base-uncased", dtype="float16", pooling="mean", port=1234) -output = embed.encode("Hi, I'm a language model") -assert isinstance(output, np.ndarray) -output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"]) -assert isinstance(output, list) and all(isinstance(x, np.ndarray) for x in output) - -llm = TGI(model="gpt2", sharded=False, port=4321) -output = llm.generate("Hi, I'm a sanity test") -assert isinstance(output, str) -output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"]) -assert isinstance(output, list) and all(isinstance(x, str) for x in output) +from py_txi.text_embedding_inference import TEI, TEIConfig +from py_txi.text_generation_inference import TGI, TGIConfig + + +def test_tei(): + embed = TEI(config=TEIConfig(pooling="cls", gpus=1)) + output = embed.encode("Hi, I'm a language model") + assert isinstance(output, np.ndarray) + output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"]) + assert isinstance(output, list) and all(isinstance(x, np.ndarray) for x in output) + + +def test_tgi(): + llm = TGI(config=TGIConfig(dtype="float16", gpus=1)) + output = llm.generate("Hi, I'm a sanity test") + assert isinstance(output, str) + output = llm.generate(["Hi, I'm a sanity 
test", "I'm a second sentence"]) + assert isinstance(output, list) and all(isinstance(x, str) for x in output) From 54f513277b67f7a157c5a0a6897c6f067912f8ac Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 06:56:14 +0100 Subject: [PATCH 2/9] fix tests and simplify examples --- example.py | 13 ++----------- py_txi/__init__.py | 6 +++--- py_txi/docker_inference_server.py | 4 +--- tests/{test.py => test_txi.py} | 0 4 files changed, 6 insertions(+), 17 deletions(-) rename tests/{test.py => test_txi.py} (100%) diff --git a/example.py b/example.py index 42f6557..50a6033 100644 --- a/example.py +++ b/example.py @@ -1,21 +1,12 @@ from py_txi.text_embedding_inference import TEI, TEIConfig from py_txi.text_generation_inference import TGI, TGIConfig -from py_txi.utils import get_free_port -port = get_free_port() -ports = {"80/tcp": ("127.0.0.1", port)} - -tei_config = TEIConfig(pooling="cls", ports=ports) -embed = TEI(tei_config) +embed = TEI(config=TEIConfig(pooling="cls")) output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"]) print("Embed:", output) embed.close() -port = get_free_port() -ports = {"80/tcp": ("127.0.0.1", port)} - -tgi_config = TGIConfig(ports=ports) -llm = TGI(tgi_config) +llm = TGI(config=TGIConfig(sharded=False)) output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"]) print("LLM:", output) llm.close() diff --git a/py_txi/__init__.py b/py_txi/__init__.py index 16c2c4f..1001649 100644 --- a/py_txi/__init__.py +++ b/py_txi/__init__.py @@ -1,3 +1,3 @@ -# from .text_embedding_inference import TEI # noqa -# from .text_generation_inference import TGI # noqa -# from .utils import is_nvidia_system, is_rocm_system # noqa +from .text_embedding_inference import TEI # noqa +from .text_generation_inference import TGI # noqa +from .utils import is_nvidia_system, is_rocm_system, get_free_port # noqa diff --git a/py_txi/docker_inference_server.py b/py_txi/docker_inference_server.py index bd8302e..a233b4e 
100644 --- a/py_txi/docker_inference_server.py +++ b/py_txi/docker_inference_server.py @@ -150,13 +150,11 @@ def close(self) -> None: LOGGER.info("\t+ Stoping Docker container") self.container.stop() self.container.wait() - del self.container LOGGER.info("\t+ Docker container stopped") + del self.container if hasattr(self, "client"): - LOGGER.info("\t+ Stoping Inference client") del self.client - LOGGER.info("\t+ Inference client stopped") def __del__(self) -> None: self.close() diff --git a/tests/test.py b/tests/test_txi.py similarity index 100% rename from tests/test.py rename to tests/test_txi.py From a1b8e4a7576e83bfad11f86e49f8161643e86dee Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 07:00:41 +0100 Subject: [PATCH 3/9] fix test --- tests/test_txi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_txi.py b/tests/test_txi.py index 573b109..fecc525 100644 --- a/tests/test_txi.py +++ b/tests/test_txi.py @@ -5,7 +5,7 @@ def test_tei(): - embed = TEI(config=TEIConfig(pooling="cls", gpus=1)) + embed = TEI(config=TEIConfig(pooling="cls")) output = embed.encode("Hi, I'm a language model") assert isinstance(output, np.ndarray) output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"]) @@ -13,7 +13,7 @@ def test_tei(): def test_tgi(): - llm = TGI(config=TGIConfig(dtype="float16", gpus=1)) + llm = TGI(config=TGIConfig(dtype="float16")) output = llm.generate("Hi, I'm a sanity test") assert isinstance(output, str) output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"]) From a45487d71809978c543c6be9d7c8bb75b8b56df0 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 07:13:35 +0100 Subject: [PATCH 4/9] added aiohttp --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d995654..d9966b1 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ name="py-tgi", version=PY_TGI_VERSION, packages=find_packages(), - 
install_requires=["docker", "huggingface-hub", "numpy"], + install_requires=["docker", "huggingface-hub", "numpy", "aiohttp"], extras_require={"quality": ["ruff"], "testing": ["pytest"]}, **common_setup_kwargs, ) From 31a5e2f86698668d80b98bdab6128d6f050022d4 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 07:28:01 +0100 Subject: [PATCH 5/9] fix --- example.py | 2 +- py_txi/text_generation_inference.py | 4 ++-- tests/test_txi.py | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/example.py b/example.py index 50a6033..20b6c43 100644 --- a/example.py +++ b/example.py @@ -6,7 +6,7 @@ print("Embed:", output) embed.close() -llm = TGI(config=TGIConfig(sharded=False)) +llm = TGI(config=TGIConfig(sharded="false")) output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"]) print("LLM:", output) llm.close() diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py index 10bad10..1c9a314 100644 --- a/py_txi/text_generation_inference.py +++ b/py_txi/text_generation_inference.py @@ -7,7 +7,7 @@ LOGGER = getLogger("TGI") - +Shareded_Literal = Literal["true", "false"] DType_Literal = Literal["float32", "float16", "bfloat16"] Quantize_Literal = Literal["bitsandbytes-nf4", "bitsandbytes-fp4", "gptq"] @@ -23,7 +23,7 @@ class TGIConfig(DockerInferenceServerConfig): revision: str = "main" dtype: Optional[DType_Literal] = None quantize: Optional[Quantize_Literal] = None - sharded: Optional[bool] = None + sharded: Optional[Shareded_Literal] = None num_shard: Optional[int] = None trust_remote_code: Optional[bool] = None disable_custom_kernels: Optional[bool] = None diff --git a/tests/test_txi.py b/tests/test_txi.py index fecc525..96efbde 100644 --- a/tests/test_txi.py +++ b/tests/test_txi.py @@ -10,6 +10,7 @@ def test_tei(): assert isinstance(output, np.ndarray) output = embed.encode(["Hi, I'm a language model", "I'm fine, how are you?"]) assert isinstance(output, list) and all(isinstance(x, 
np.ndarray) for x in output) + embed.close() def test_tgi(): @@ -18,3 +19,4 @@ def test_tgi(): assert isinstance(output, str) output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"]) assert isinstance(output, list) and all(isinstance(x, str) for x in output) + llm.close() From cb9e31ed617522fe55042b516d68dbb5bda67402 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 07:42:30 +0100 Subject: [PATCH 6/9] fix fr --- py_txi/text_generation_inference.py | 2 -- tests/test_txi.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py index 1c9a314..e6d0625 100644 --- a/py_txi/text_generation_inference.py +++ b/py_txi/text_generation_inference.py @@ -14,8 +14,6 @@ @dataclass class TGIConfig(DockerInferenceServerConfig): - # { model_id: "gpt2", revision: Some("main"), validation_workers: 2, sharded: None, num_shard: None, quantize: None, speculate: None, dtype: None, trust_remote_code: false, max_concurrent_requests: 128, max_best_of: 2, max_stop_sequences: 4, max_top_n_tokens: 5, max_input_length: 1024, max_total_tokens: 2048, waiting_served_ratio: 1.2, max_batch_prefill_tokens: 4096, max_batch_total_tokens: None, max_waiting_tokens: 20, max_batch_size: None, enable_cuda_graphs: false, hostname: "6fedb07983ae", port: 80, shard_uds_path: "/tmp/text-generation-server", master_addr: "localhost", master_port: 29500, huggingface_hub_cache: Some("/data"), weights_cache_override: None, disable_custom_kernels: false, cuda_memory_fraction: 1.0, rope_scaling: None, rope_factor: None, json_output: false, otlp_endpoint: None, cors_allow_origin: [], watermark_gamma: None, watermark_delta: None, ngrok: false, ngrok_authtoken: None, ngrok_edge: None, tokenizer_config_path: None, disable_grammar_support: false, env: false } - # Docker options image: str = "ghcr.io/huggingface/text-generation-inference:latest" # Launcher options diff --git a/tests/test_txi.py 
b/tests/test_txi.py index 96efbde..baa9c36 100644 --- a/tests/test_txi.py +++ b/tests/test_txi.py @@ -14,7 +14,7 @@ def test_tei(): def test_tgi(): - llm = TGI(config=TGIConfig(dtype="float16")) + llm = TGI(config=TGIConfig(sharded="false")) output = llm.generate("Hi, I'm a sanity test") assert isinstance(output, str) output = llm.generate(["Hi, I'm a sanity test", "I'm a second sentence"]) From 17d7a4dda6a6eb5440b9a27234ca17bd966e3af6 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 1 Mar 2024 07:49:45 +0100 Subject: [PATCH 7/9] update stuff --- .github/workflows/{quality_checks.yaml => quality.yaml} | 8 ++++---- .github/workflows/{tests.yaml => test.yaml} | 6 +++--- Makefile | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) rename .github/workflows/{quality_checks.yaml => quality.yaml} (86%) rename .github/workflows/{tests.yaml => test.yaml} (88%) diff --git a/.github/workflows/quality_checks.yaml b/.github/workflows/quality.yaml similarity index 86% rename from .github/workflows/quality_checks.yaml rename to .github/workflows/quality.yaml index a7bc11e..86fe496 100644 --- a/.github/workflows/quality_checks.yaml +++ b/.github/workflows/quality.yaml @@ -1,4 +1,4 @@ -name: quality checks +name: quality on: push: @@ -13,10 +13,10 @@ concurrency: cancel-in-progress: true jobs: - run_quality_checks: + check_quality: runs-on: ubuntu-latest steps: - - name: Checkout + - name: Checkout code uses: actions/checkout@v3 - name: Set up Python 3.10 @@ -29,6 +29,6 @@ jobs: pip install --upgrade pip pip install -e .[quality] - - name: Check style + - name: Check quality run: | make quality diff --git a/.github/workflows/tests.yaml b/.github/workflows/test.yaml similarity index 88% rename from .github/workflows/tests.yaml rename to .github/workflows/test.yaml index 0f73035..c29376a 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/test.yaml @@ -1,4 +1,4 @@ -name: tests +name: test on: push: @@ -24,11 +24,11 @@ jobs: with: python-version: "3.10" - - 
name: Install requirements + - name: Install testing requirements run: | pip install --upgrade pip pip install -e .[testing] - name: Run test run: | - pytest tests/ + make test diff --git a/Makefile b/Makefile index 9ff0121..493340b 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ style: ruff check --fix . test: - python tests/test.py + pytest tests/ -x install: pip install -e . \ No newline at end of file From 75855a73c3e773a7914460902c9499ce6d0a7f91 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 5 Mar 2024 03:15:15 +0000 Subject: [PATCH 8/9] auto rocm --- py_txi/text_embedding_inference.py | 7 +++++++ py_txi/text_generation_inference.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/py_txi/text_embedding_inference.py b/py_txi/text_embedding_inference.py index 38e19d1..86ba5c0 100644 --- a/py_txi/text_embedding_inference.py +++ b/py_txi/text_embedding_inference.py @@ -6,6 +6,7 @@ import numpy as np from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig +from .utils import is_nvidia_system LOGGER = getLogger("TEI") @@ -28,6 +29,12 @@ class TEIConfig(DockerInferenceServerConfig): def __post_init__(self) -> None: super().__post_init__() + if is_nvidia_system() and "cpu" in self.image: + LOGGER.warning( + "Your system has NVIDIA GPU, but you are using a CPU image." + "Consider using a GPU image for better performance." 
+ ) + class TEI(DockerInferenceServer): NAME: str = "Text-Embedding-Inference" diff --git a/py_txi/text_generation_inference.py b/py_txi/text_generation_inference.py index e6d0625..f297bd0 100644 --- a/py_txi/text_generation_inference.py +++ b/py_txi/text_generation_inference.py @@ -4,6 +4,7 @@ from typing import Literal, Optional, Union from .docker_inference_server import DockerInferenceServer, DockerInferenceServerConfig +from .utils import is_rocm_system LOGGER = getLogger("TGI") @@ -54,6 +55,13 @@ class TGIConfig(DockerInferenceServerConfig): def __post_init__(self) -> None: super().__post_init__() + if is_rocm_system() and "rocm" not in self.image: + LOGGER.warning( + "You are running on a ROCm system but the image is not rocm specific. " + "Add 'rocm' to the image name to use the rocm specific image." + ) + self.image += "-rocm" + class TGI(DockerInferenceServer): NAME: str = "Text-Generation-Inference" From c59156210b53c14dfefda9ae0175d559cffaaab7 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 5 Mar 2024 03:15:24 +0000 Subject: [PATCH 9/9] update readme and setup --- README.md | 47 ++++++++++++++++++++--------------------------- setup.py | 8 ++++---- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 76553d1..63ff32a 100644 --- a/README.md +++ b/README.md @@ -1,51 +1,44 @@ -# Py-TGI (Py-TXI at this point xD) +# Py-TXI (previously Py-TGI) -[![PyPI version](https://badge.fury.io/py/py-tgi.svg)](https://badge.fury.io/py/py-tgi) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/py-tgi)](https://pypi.org/project/py-tgi/) -[![PyPI - Format](https://img.shields.io/pypi/format/py-tgi)](https://pypi.org/project/py-tgi/) -[![Downloads](https://pepy.tech/badge/py-tgi)](https://pepy.tech/project/py-tgi) -[![PyPI - License](https://img.shields.io/pypi/l/py-tgi)](https://pypi.org/project/py-tgi/) 
-[![Tests](https://github.com/IlyasMoutawwakil/py-tgi/actions/workflows/tests.yaml/badge.svg)](https://github.com/IlyasMoutawwakil/py-tgi/actions/workflows/tests.yaml) +[![PyPI version](https://badge.fury.io/py/py-txi.svg)](https://badge.fury.io/py/py-txi) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/py-txi)](https://pypi.org/project/py-txi/) +[![PyPI - Format](https://img.shields.io/pypi/format/py-txi)](https://pypi.org/project/py-txi/) +[![Downloads](https://pepy.tech/badge/py-txi)](https://pepy.tech/project/py-txi) +[![PyPI - License](https://img.shields.io/pypi/l/py-txi)](https://pypi.org/project/py-txi/) +[![Tests](https://github.com/IlyasMoutawwakil/py-txi/actions/workflows/test.yaml/badge.svg)](https://github.com/IlyasMoutawwakil/py-txi/actions/workflows/test.yaml) -Py-TGI is a Python wrapper around [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference) and [Text-Embedding-Inference](https://github.com/huggingface/text-embeddings-inference) that enables creating and running TGI/TEI instances through the awesome `docker-py` in a similar style to Transformers API. +Py-TXI is a Python wrapper around [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference) and [Text-Embedding-Inference](https://github.com/huggingface/text-embeddings-inference) that enables creating and running TGI/TEI instances through the awesome `docker-py` in a similar style to Transformers API. ## Installation ```bash -pip install py-tgi +pip install py-txi ``` -Py-TGI is designed to be used in a similar way to Transformers API. We use `docker-py` (instead of a dirty `subprocess` solution) so that the containers you run are linked to the main process and are stopped automatically when your code finishes or fails. +Py-TXI is designed to be used in a similar way to Transformers API. 
We use `docker-py` (instead of a dirty `subprocess` solution) so that the containers you run are linked to the main process and are stopped automatically when your code finishes or fails. ## Usage Here's an example of how to use it: ```python -from py_tgi import TGI, is_nvidia_system, is_rocm_system +from py_txi.text_generation_inference import TGI, TGIConfig -llm = TGI( - model="NousResearch/Llama-2-7b-hf", - devices=["/dev/kfd", "/dev/dri"] if is_rocm_system() else None, - gpus="all" if is_nvidia_system() else None, -) +llm = TGI(config=TGIConfig(sharded="false")) output = llm.generate(["Hi, I'm a language model", "I'm fine, how are you?"]) -print(output) +print("LLM:", output) +llm.close() ``` -Output: ```[" and I'm here to help you with any questions you have. What can I help you with", "\nUser 0: I'm doing well, thanks for asking. I'm just a"]``` +Output: ```LLM: ["er. I'm a language modeler. I'm a language modeler. I'm a language", " I'm fine, how are you? I'm fine, how are you? I'm fine,"]``` ```python -from py_tgi import TEI, is_nvidia_system - -embed = TEI( - model="BAAI/bge-large-en-v1.5", - dtype="float16", - pooling="mean", - gpus="all" if is_nvidia_system() else None, -) +from py_txi.text_embedding_inference import TEI, TEIConfig + +embed = TEI(config=TEIConfig(pooling="cls")) output = embed.encode(["Hi, I'm an embedding model", "I'm fine, how are you?"]) -print(output) +print("Embed:", output) +embed.close() ``` Output: ```[array([[ 0.01058742, -0.01588806, -0.03487622, ..., -0.01613717, diff --git a/setup.py b/setup.py index d9966b1..f283131 100644 --- a/setup.py +++ b/setup.py @@ -2,14 +2,14 @@ from setuptools import find_packages, setup -PY_TGI_VERSION = "0.2.0" +PY_TXI_VERSION = "0.4.0" common_setup_kwargs = { "author": "Ilyas Moutawwakil", "author_email": "ilyas.moutawwakil@gmail.com", "description": "A Python wrapper around TGI and TEI servers", "keywords": ["tgi", "llm", "tei", "embedding", "huggingface", "docker", "python"], - "url": 
"https://github.com/IlyasMoutawwakil/py-tgi", + "url": "https://github.com/IlyasMoutawwakil/py-txi", "long_description_content_type": "text/markdown", "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"), "platforms": ["linux", "windows", "macos"], @@ -21,8 +21,8 @@ setup( - name="py-tgi", - version=PY_TGI_VERSION, + name="py-txi", + version=PY_TXI_VERSION, packages=find_packages(), install_requires=["docker", "huggingface-hub", "numpy", "aiohttp"], extras_require={"quality": ["ruff"], "testing": ["pytest"]},