-
Notifications
You must be signed in to change notification settings - Fork 97
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8077cf1
commit 48a0b27
Showing
21 changed files
with
316 additions
and
98 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
FROM nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3

LABEL maintainer="Morgan Funtowicz <[email protected]>"

# Build-time metadata injected by CI (label-schema convention).
ARG VCS_REF
ARG BUILD_DATE
ARG BUILD_VERSION

LABEL org.label-schema.schema-version="1.0"
LABEL org.label-schema.name="huggingface/inference-endpoints-trtllm"
LABEL org.label-schema.build-date=$BUILD_DATE
# Fix: a second, hard-coded `version="1.0.0"` label previously appeared after
# this one and silently overrode $BUILD_VERSION (last LABEL wins in Docker).
LABEL org.label-schema.version=$BUILD_VERSION
LABEL org.label-schema.vcs-ref=$VCS_REF
LABEL org.label-schema.vendor="Hugging Face Inc."
LABEL org.label-schema.url="https://hf.co/hardware"
LABEL org.label-schema.vcs-url="https://github.com/huggingface/optimum-nvidia"
# Fix: typo "decription" -> "description".
LABEL org.label-schema.description="Hugging Face Inference Server docker image for TensorRT-LLM Inference"

# Hub token for pulling gated/private models; intended to be supplied at run
# time (`docker run -e HF_HUB_TOKEN=...`), not baked into the image.
# Modernized legacy `ENV key value` to the `ENV key=value` form.
ENV HF_HUB_TOKEN=""

# Expose (in-order) HTTP, GRPC, Metrics endpoints
EXPOSE 8000/tcp
EXPOSE 8001/tcp
EXPOSE 8002/tcp

WORKDIR /repository

#ENTRYPOINT "huggingface-cli login --token ${HF_HUB_TOKEN}
CMD ["mpirun", "--allow-run-as-root", "-n", "1", "/opt/tritonserver/bin/tritonserver", "--exit-on-error=false", "--model-repo=/repository"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from typing import NamedTuple | ||
|
||
from tensorrt_llm.quantization import QuantMode | ||
|
||
|
||
class QuantizationConfig(NamedTuple):
    """Pair of quantization settings for a TRT-LLM build.

    Attributes:
        mode: the tensorrt_llm QuantMode flag set describing the scheme.
        group_size: group size used by group-wise quantization schemes.
    """

    mode: QuantMode
    group_size: int
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from typing import Optional | ||
|
||
from huggingface_hub import ModelHubMixin | ||
|
||
from tensorrt_llm import ModelConfig, Mapping | ||
from tensorrt_llm.runtime import GenerationSession | ||
|
||
|
||
class TRTEnginePretrainedModel(ModelHubMixin):
    """Marker base class for TensorRT-engine-backed models.

    Inherits huggingface_hub.ModelHubMixin (Hub save/load integration);
    adds no behavior of its own.
    """
    pass
class TRTEngineForCausalLM(TRTEnginePretrainedModel):
    """Causal-LM wrapper around a serialized TensorRT-LLM engine.

    Stores the model configuration and parallelism mapping, and owns a
    tensorrt_llm.runtime.GenerationSession built from the engine buffer.
    """

    # NOTE(review): `_cache` is declared here but never assigned in __init__;
    # with __slots__, reading it before assignment raises AttributeError —
    # confirm it is set elsewhere before first use.
    __slots__ = ("_config", "_mapping", "_session", "_cache", "_stream")

    def __init__(
        self,
        config: ModelConfig,
        mapping: Mapping,
        engine: bytes,
        stream: Optional["torch.cuda.Stream"] = None,
        use_cuda_graph: bool = False,
    ):
        """Build a GenerationSession for the given engine.

        Args:
            config: TensorRT-LLM ModelConfig describing the built engine.
            mapping: tensor-/pipeline-parallel rank mapping for this process.
            engine: serialized TensorRT engine, passed through as the
                session's engine buffer.
            stream: CUDA stream the session should run on, or None.
                (Forward reference — `torch` is presumably imported at file
                top; not visible in this chunk, TODO confirm.)
            use_cuda_graph: forwarded to the session as cuda_graph_mode.
        """
        super().__init__()

        self._config = config
        self._mapping = mapping
        self._stream = stream
        # use_cuda_graph is consumed by the session and not kept on self.
        self._session = GenerationSession(
            model_config=config,
            engine_buffer=engine,
            mapping=mapping,
            stream=stream,
            cuda_graph_mode=use_cuda_graph
        )
Oops, something went wrong.