Commit 48a0b27: Sync

mfuntowicz committed Nov 8, 2023
1 parent 8077cf1 commit 48a0b27

Showing 21 changed files with 316 additions and 98 deletions.
2 changes: 1 addition & 1 deletion docker/Dockerfile.dev
@@ -22,6 +22,6 @@ RUN cuda_version=$(nvcc --version | grep 'release' | awk '{print $6}' | awk -F'[
rm -rf nvidia_ammo-${NVIDIA_AMMO_VERSION}*

# Install dependencies
RUN python -m pip install huggingface_hub
RUN python -m pip install datasets huggingface_hub transformers

WORKDIR /workspace
31 changes: 31 additions & 0 deletions docker/Dockerfile.endpoint
@@ -0,0 +1,31 @@
FROM nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3

LABEL maintainer="Morgan Funtowicz <[email protected]>"

ARG VCS_REF
ARG BUILD_DATE
ARG BUILD_VERSION

LABEL org.label-schema.schema-version="1.0"
LABEL org.label-schema.name="huggingface/inference-endpoints-trtllm"
LABEL org.label-schema.build-date=$BUILD_DATE
LABEL org.label-schema.version=$BUILD_VERSION
LABEL org.label-schema.vcs-ref=$VCS_REF
LABEL org.label-schema.vendor="Hugging Face Inc."
LABEL org.label-schema.version="1.0.0"
LABEL org.label-schema.url="https://hf.co/hardware"
LABEL org.label-schema.vcs-url="https://github.com/huggingface/optimum-nvidia"
LABEL org.label-schema.description="Hugging Face Inference Server docker image for TensorRT-LLM Inference"

ENV HF_HUB_TOKEN ""


# Expose (in-order) HTTP, GRPC, Metrics endpoints
EXPOSE 8000/tcp
EXPOSE 8001/tcp
EXPOSE 8002/tcp

WORKDIR /repository

#ENTRYPOINT "huggingface-cli login --token ${HF_HUB_TOKEN}"
CMD ["mpirun", "--allow-run-as-root", "-n", "1", "/opt/tritonserver/bin/tritonserver", "--exit-on-error=false", "--model-repo=/repository"]
40 changes: 29 additions & 11 deletions examples/text-generation/llama.py
@@ -18,13 +18,12 @@
from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import setup_logging

# Setup logging
setup_logging(False)

from optimum.nvidia import TRTEngineBuilder
from optimum.nvidia import TRTEngineBuilder, TRTEngineForCausalLM
from optimum.nvidia.models.llama import LlamaWeightAdapter
from optimum.nvidia.utils.cli import *

@@ -33,10 +32,11 @@

if __name__ == '__main__':
parser = ArgumentParser("🤗 TensorRT-LLM Llama implementation")
parser.add_argument("--hub-token", type=str, help="Hugging Face Hub Token to retrieve private weights.")
register_common_model_topology_args(parser)
register_optimization_profiles_args(parser)
register_quantization_args(parser) # Inject params.quantization_config
register_triton_server_args(parser)
register_quantization_args(parser)

parser.add_argument("model", type=str, help="The model's id or path to use.")
parser.add_argument("output", type=Path, help="Path to store generated TensorRT engine.")
@@ -55,19 +55,37 @@
args.output.mkdir()

LOGGER.info(f"Exporting {args.model} to TensorRT-LLM engine at {args.output}")
tokenizer = AutoTokenizer.from_pretrained(args.model)
builder = TRTEngineBuilder.from_pretrained(args.model, adapter=LlamaWeightAdapter) \
.to(args.dtype) \
.shard(args.tensor_parallelism, args.pipeline_parallelism, args.world_size, args.gpus_per_node) \
.with_quantization_profile(args.quantization_mode) \
.with_generation_profile(args.max_batch_size, args.max_prompt_length, args.max_new_tokens) \
.with_sampling_strategy(args.max_beam_width)
builder.build(args.output)
if args.hub_token is not None:
from huggingface_hub import login
login(args.hub_token)

if args.has_quantization_step:
from optimum.nvidia.weights.quantization.ammo import AmmoQuantizer
LOGGER.info(f"About to calibrate model for quantization {args.quantization_config}")
quantizer = AmmoQuantizer.from_pretrained(args.model)

tokenizer = AutoTokenizer.from_pretrained(args.model, token=True)
# builder = TRTEngineBuilder.from_pretrained(args.model, adapter=LlamaWeightAdapter) \
# .to(args.dtype) \
# .shard(args.tensor_parallelism, args.pipeline_parallelism, args.world_size, args.gpus_per_node) \
# .with_quantization_profile(args.quantization_config) \
# .with_generation_profile(args.max_batch_size, args.max_prompt_length, args.max_new_tokens) \
# .with_sampling_strategy(args.max_beam_width)
# builder.build(args.output)

if args.with_triton_structure:
# generator = TritonLayoutGenerator()
LOGGER.info(f"Exporting Triton Inference Server structure at {args.output}")
tokenizer_output = args.output.joinpath("tokenizer/")
tokenizer.save_pretrained(tokenizer_output)

with open(args.output.joinpath("config.json"), mode="r", encoding="utf-8") as config_f:
from json import load
config = load(config_f)

with open(args.output.joinpath("llama_float16_tp1_rank0.engine"), mode="rb") as model_f:
from tensorrt_llm import Mapping
engine = model_f.read()
model = TRTEngineForCausalLM(config, Mapping(), engine, use_cuda_graph=False)

print(f"TRTLLM engines have been saved at {args.output}.")
13 changes: 11 additions & 2 deletions setup.py
@@ -34,7 +34,7 @@
"optimum >= 1.13.0",
]

TESTS_REQUIRE = [
TESTS_REQUIRES = [
"pytest",
"psutil",
"parameterized",
@@ -49,9 +49,18 @@
"hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",
]


QUANTIZATION_REQUIRES = [
"ammo" # This one is a bit harder to install ...
"datasets"
"transformers",
"torch",
]

EXTRAS_REQUIRE = {
"tests": TESTS_REQUIRE,
"tests": TESTS_REQUIRES,
"quality": QUALITY_REQUIRES,
"quantization": QUANTIZATION_REQUIRES,
}

setup(
1 change: 1 addition & 0 deletions src/optimum/nvidia/__init__.py
@@ -17,5 +17,6 @@
from .version import __version__, VERSION

from .builder import TRTEngineBuilder
from .runtime import TRTEnginePretrainedModel, TRTEngineForCausalLM

DEFAULT_HF_HUB_TRT_REVISION: str = "trt-llm"
24 changes: 12 additions & 12 deletions src/optimum/nvidia/builder.py
@@ -24,12 +24,12 @@
from huggingface_hub import ModelHubMixin, HfFileSystem
from huggingface_hub.hub_mixin import T

from optimum.nvidia.configs import ModelConfig, TransformersConfig
from optimum.nvidia.configs import ModelConfig, TransformersConfig, QuantizationConfig
from optimum.nvidia.errors import UnsupportedOperation
from optimum.nvidia.lang import DataType
from optimum.nvidia.utils import ensure_file_exists_locally
from optimum.nvidia.weights import SupportsSafetensors, WeightAdapter, SupportsWeightCompression, QUANTIZATION_PROTOCOLS
from optimum.nvidia.weights.compression import awq_weight_only_compression
from optimum.nvidia.weights.quantization import to_awq_module
from optimum.nvidia.weights.hub import get_safetensors_files
from optimum.nvidia.utils.onnx import to_onnx

@@ -112,7 +112,7 @@ def __init__(self, model_id_or_path: Union[str, PathLike], config: ModelConfig,
self._dtype = DataType.FLOAT16
self._build_info: BuildInfo = SERIAL_BUILD
self._sharding_info: ShardingInfo = NO_SHARDING
self._quantization_descriptor = None
self._quantization_descriptor: QuantizationConfig = None
self._optimization_profile: OptimizationProfile = None

# Sampling
@@ -147,7 +147,7 @@ def shard(self, tp_degree: int, pp_degree: int, world_size: int, num_gpus_per_no

return self

def with_quantization_profile(self, descriptor: QuantMode) -> "TRTEngineBuilder":
def with_quantization_profile(self, descriptor: QuantMode, group_size: int = -1) -> "TRTEngineBuilder":
if not isinstance(self._weight_adapter, SupportsWeightCompression):
raise UnsupportedOperation.quantization(
f"{self._weight_adapter} doesn't implement one of the quantization protocols {QUANTIZATION_PROTOCOLS},"
@@ -203,7 +203,7 @@ def validate(self) -> bool:
"Quantization descriptor was None, assuming no quantization will be applied. "
"If you want to change this behaviour, please use TRTEngineBuilder.with_quantization_schema()"
)
self._quantization_descriptor = QuantMode(0)
self._quantization_descriptor = QuantizationConfig(QuantMode(0), 0)

# Optimization profile
if self._optimization_profile is None:
@@ -349,21 +349,21 @@ def _build_engine_for_rank(self, shard: Shard, weight_files: List[PathLike], out
)
# build_config.trt_builder_config.builder_optimization_level = 5

if self._quantization_descriptor.is_weight_only():
qconfig = self._quantization_descriptor
if qconfig.mode.is_weight_only():
if isinstance(self._weight_adapter, SupportsWeightCompression):
weights_compression = self._weight_adapter
quantization_desc = self._quantization_descriptor

# Apply AWQ style weight quantization
model = awq_weight_only_compression(
model = to_awq_module(
model,
quantization_desc,
group_size=128, # TODO: Move to quantization parameter
exclude_modules=weights_compression.EXCLUDED_WEIGHT_PARAMETERS
qconfig.mode,
group_size=qconfig.group_size,
exclude_modules=weights_compression.QUANTIZATION_EXCLUDED_PARAMETERS
)

if issubclass(self._weight_adapter, SupportsSafetensors):
self._weight_adapter.from_safetensors(weight_files, model, config, build_config, shard)
self._weight_adapter.from_safetensors(weight_files, model, config, build_config, qconfig, shard)

# Let's build the network
network = builder.create_network()
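Putting the builder changes together, a sketch of how the fluent API might be driven end to end with quantization left disabled (the model id and output path are placeholders, and passing DataType.FLOAT16 to .to() is an assumption based on the builder's internal default):

from pathlib import Path

from tensorrt_llm.quantization import QuantMode

from optimum.nvidia import TRTEngineBuilder
from optimum.nvidia.lang import DataType
from optimum.nvidia.models.llama import LlamaWeightAdapter

# Single GPU, no sharding, quantization disabled (QuantMode(0)).
builder = TRTEngineBuilder.from_pretrained("meta-llama/Llama-2-7b-hf", adapter=LlamaWeightAdapter) \
    .to(DataType.FLOAT16) \
    .with_quantization_profile(QuantMode(0)) \
    .with_generation_profile(1, 512, 128) \
    .with_sampling_strategy(1)  # args: max_batch_size, max_prompt_length, max_new_tokens, then max_beam_width
builder.build(Path("./llama-trt-engine"))
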
1 change: 1 addition & 0 deletions src/optimum/nvidia/configs/__init__.py
@@ -15,3 +15,4 @@
from typing import Protocol

from .base import ModelConfig, TransformersConfig
from .quantization import QuantizationConfig
9 changes: 9 additions & 0 deletions src/optimum/nvidia/configs/quantization.py
@@ -0,0 +1,9 @@
from typing import NamedTuple

from tensorrt_llm.quantization import QuantMode


QuantizationConfig = NamedTuple("QuantizationConfig", [
("mode", QuantMode),
("group_size", int),
])
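Being a NamedTuple, the config is cheap to build and carries both the QuantMode and the group size later consumed by the builder. A minimal sketch mirroring the "no quantization" default the builder falls back to in validate():

from tensorrt_llm.quantization import QuantMode

from optimum.nvidia.configs import QuantizationConfig

# Disabled quantization: QuantMode(0) with an unused group size.
no_quant = QuantizationConfig(mode=QuantMode(0), group_size=0)
print(no_quant.mode.is_weight_only())  # -> False

# Fields are accessible by name or by position, as with any NamedTuple.
mode, group_size = no_quant
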
43 changes: 27 additions & 16 deletions src/optimum/nvidia/models/llama.py
@@ -18,11 +18,12 @@

import numpy as np

from optimum.nvidia.configs import ModelConfig
from optimum.nvidia.configs import ModelConfig, QuantizationConfig
from optimum.nvidia.lang import DataType
from optimum.nvidia.models import ConvertibleModel
from optimum.nvidia.weights import WeightAdapter, SupportsWeightCompression, shard
from optimum.nvidia.weights.safetensors import SupportsSafetensors, SafetensorsAccessor
from optimum.nvidia.weights import WeightAdapter, shard
from optimum.nvidia.weights import SupportsSafetensors, SupportsWeightCompression
from optimum.nvidia.weights.safetensors import SafetensorsAccessor
from safetensors import deserialize
from tensorrt_llm import BuilderConfig, Mapping as ShardingConfig, Module
from tensorrt_llm.models import LLaMAForCausalLM
@@ -37,23 +38,25 @@ class LlamaWeightAdapter(WeightAdapter, SupportsSafetensors, SupportsWeightCompr
"""

EXCLUDED_WEIGHT_PARAMETERS = set(["lm_head"])
QUANTIZATION_EXCLUDED_PARAMETERS = set(["lm_head"])
NAMED_WEIGHT_PARAMETERS = {
"self_attn.o_proj.weight": ("attention.dense", 1),
"mlp.up_proj.weight": ("mlp.gate.weight", 0),
"mlp.down_proj.weight": ("mlp.proj.weight", 1),
"mlp.gate_proj.weight": ("mlp.fc.weight", 0)
}

@staticmethod
@property
def named_weight_parameters() -> Iterable[str]:
return {
"self_attn.o_proj.weight",
"mlp.up_proj.weight",
"mlp.down_proj.weight",
"mlp.gate_proj.weight",
}
return LlamaWeightAdapter.NAMED_WEIGHT_PARAMETERS.keys()

def convert(
self,
model: Module,
config: ModelConfig,
builder: BuilderConfig,
qconfig: QuantizationConfig,
rank: int,
weights: Mapping[str, np.array]
) -> Module:
@@ -78,9 +81,8 @@ def convert(
factor = tp_size // num_head
weight = weight.reshape(num_head, 1, head_size, -1).repeat(factor, axis=1)
weight = weight.reshape(num_head * reps * head_size, -1).clone()
qkv_weight = [q_weight, k_weight, v_weight]
else:
qkv_weight = np.concatenate((q_weight, k_weight, v_weight), axis=0)

qkv_weight = [q_weight, k_weight, v_weight]

# Insert the packed weights inside the weights
qkv_packed_layers.append(qkv_weight)
@@ -128,20 +130,28 @@ def convert(

# Self attention layer
qkv = qkv_packed_layers[idx]
q, k, v = qkv

# Shard
if config.use_multi_query_attention: # TODO: support GQA
q, k, v = qkv
wq, wk, wv = (
shard(q, rank, shard_info.tp_size, axis=0),
shard(k, rank, shard_info.tp_size, axis=0),
shard(v, rank, shard_info.tp_size, axis=0),
)

qkv_weight = np.concatenate((wq, wk, wv), axis=0)
else:
qkw_weight = np.concatenate((q_weight, k_weight, v_weight), axis=0)
qkv = qkv.reshape(3, config.hidden_size, config.hidden_size)
rank_tensor = shard(qkv, rank, shard_info.tp_size, axis=1)
qkv_weight = rank_tensor.reshape(-1, config.hidden_size)

model.layers[idx].attention.qkv.weight.value = np.ascontiguousarray(qkv_weight)

if qconfig.mode == QUANTIZATION_DISABLED:
model.layers[idx].attention.qkv.weight.value = np.ascontiguousarray(qkv_weight)
else:
raise NotImplementedError("quantized weights are not yet implemented")

# Common projection logic
for (src, dst, shard_axis) in [
@@ -181,14 +191,15 @@ def from_safetensors(
model: Module,
config: ModelConfig,
builder_config: BuilderConfig,
qconfig: QuantizationConfig,
sharding_config: ShardingConfig,
) -> Module:
if not isinstance(model, LLaMAForCausalLM):
raise ValueError(f"model has to be a derived type from LLaMAForCausalLM, got {type(model)}")

accessor = SafetensorsAccessor.from_files(paths)
adapter = cls(sharding_config)
adapter.convert(model, config, builder_config, sharding_config.rank, accessor)
adapter.convert(model, config, builder_config, qconfig, sharding_config.rank, accessor)

return model

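The conversion above relies on the shard() helper to split weights across tensor-parallel ranks. Its implementation is not part of this diff; a minimal numpy sketch of the idea (an assumption, not the actual optimum.nvidia.weights.shard code):

import numpy as np

def shard(weight: np.ndarray, rank: int, tp_size: int, axis: int = 0) -> np.ndarray:
    """Split `weight` into tp_size equal chunks along `axis` and keep the chunk owned by `rank`."""
    if tp_size == 1:
        return weight
    return np.ascontiguousarray(np.split(weight, tp_size, axis=axis)[rank])

# Example: shard a square projection across two ranks along the output dimension.
w = np.zeros((4096, 4096), dtype=np.float16)
w_rank0 = shard(w, rank=0, tp_size=2, axis=0)  # shape (2048, 4096)
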
35 changes: 35 additions & 0 deletions src/optimum/nvidia/runtime.py
@@ -0,0 +1,35 @@
from typing import Optional

from huggingface_hub import ModelHubMixin

from tensorrt_llm import ModelConfig, Mapping
from tensorrt_llm.runtime import GenerationSession


class TRTEnginePretrainedModel(ModelHubMixin):
pass


class TRTEngineForCausalLM(TRTEnginePretrainedModel):
__slots__ = ("_config", "_mapping", "_session", "_cache", "_stream")

def __init__(
self,
config: ModelConfig,
mapping: Mapping,
engine: bytes,
stream: Optional["torch.cuda.Stream"] = None,
use_cuda_graph: bool = False,
):
super().__init__()

self._config = config
self._mapping = mapping
self._stream = stream
self._session = GenerationSession(
model_config=config,
engine_buffer=engine,
mapping=mapping,
stream=stream,
cuda_graph_mode=use_cuda_graph
)