Make overall optimum-nvidia pip installable (#83)
* Add dependencies on tensorrt-llm and additional nvidia repository

* Attempt to get pip working by manually building the index URL

* Make `optimum-nvidia` pip installable

* Update setup.py to latest requirements

* Let's remove the tensorrt-llm dependency for now to avoid a transformers conflict

* pip install optimum-nvidia in the containers

* Pin huggingface_hub to 0.20.0

* Quality
mfuntowicz authored Feb 28, 2024
1 parent 8119ca4 commit 816ec44
Showing 8 changed files with 112 additions and 51 deletions.
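The net effect of this change set is that the package resolves straight from PyPI, with NVIDIA-hosted wheels coming from the extra index registered in setup.py below (https://pypi.nvidia.com). A rough sketch of what the install looks like when driven from Python; the exact command, and whether the extra index is needed at all while tensorrt-llm stays commented out, are assumptions rather than something this commit documents:

# Sketch only: install optimum-nvidia, letting pip also consult the NVIDIA
# package index that this commit registers via dependency_links in setup.py.
# Shell equivalent: pip install --extra-index-url https://pypi.nvidia.com optimum-nvidia
import subprocess
import sys

subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "--extra-index-url", "https://pypi.nvidia.com",
    "optimum-nvidia",
])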
20 changes: 10 additions & 10 deletions README.md
@@ -6,10 +6,9 @@ Optimum-NVIDIA
<h4> Optimized inference with NVIDIA and Hugging Face </h4>

[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://huggingface.co/docs/optimum/index)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31013/)
[![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.6.1-green)](https://github.com/nvidia/tensorrt-llm)
[![version](https://img.shields.io/badge/release-0.1.0-green)]()
[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.9.0-green)](https://github.com/nvidia/tensorrt-llm)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

---
@@ -85,10 +84,10 @@ model = AutoModelForCausalLM.from_pretrained(
model_inputs = tokenizer(["How is autonomous vehicle technology transforming the future of transportation and urban planning?"], return_tensors="pt").to("cuda")

generated_ids = model.generate(
    **model_inputs,
    top_k=40,
    top_p=0.7,
    repetition_penalty=10,
)

tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -99,8 +98,7 @@ To learn more about text generation with LLMs, check out [this guide](https://hu
<!-- For more details, read our [documentation](https://huggingface.com/docs/optimum/nvidia/index). -->

# Support Matrix
We test Optimum-NVIDIA on 4090, L40S, and H100 Tensor Core GPUs, though it is expected to work on any GPU based on the following architectures:
* Volta
* Turing (with experimental support for T4 / RTX Quadro x000)
* Ampere (A100/A30 are supported. Experimental support for A10, A40, RTX Ax000)
* Hopper
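A quick way to check whether a given machine falls into one of these buckets is pynvml, which is already in the dependency list; the compute-capability values below mirror the architecture comments in the Dockerfile further down (75 = Turing, 80/86 = Ampere, 89 = Ada/L4/4090, 90 = Hopper) and are otherwise an assumption, not something this commit ships:

import pynvml

# Compute capabilities corresponding to the architectures listed above.
SUPPORTED_CC = {(7, 5), (8, 0), (8, 6), (8, 9), (9, 0)}

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
    print(f"GPU 0 compute capability: {major}.{minor}")
    print("Covered by the support matrix:", (major, minor) in SUPPORTED_CC)
finally:
    pynvml.nvmlShutdown()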
@@ -116,7 +114,9 @@ Optimum-NVIDIA currently accelerates text-generation with LLaMAForCausalLM, and
| Model | Tasks |
| :---- | :---- |
| LLaMAForCausalLM | TextGeneration |
| Gemma | TextGeneration |
| Llama | TextGeneration |
| Mistral | TextGeneration |
| Additional Models | Coming soon | -->

# Contributing
13 changes: 3 additions & 10 deletions docker/Dockerfile
@@ -1,23 +1,16 @@
FROM tensorrt_llm/release:latest

# 70 = V100
# 75 = T4/RTX Quadro
# 80 = A100/A30
# 86 = A10/A40/RTX Axxx
# 89 = L4/L40/L40s/RTX Ada/4090
# 90 = H100/H200
#ARG TARGET_CUDA_ARCHS="70-real;75-real;80-real;86-real;89-real;90-real"
#ARG TARGET_CUDA_ARCHS="75-real;80-real;86-real;89-real;90-real"

COPY . /opt/optimum-nvidia

# Install tensorrt-llm
# TODO: Reduce the container size removing build artifacts
WORKDIR /opt/optimum-nvidia/third-party/tensorrt-llm

# Install dependencies
RUN python -m pip install --upgrade --no-cache-dir datasets huggingface_hub hf-transfer optimum transformers pynvml
ENV PYTHONPATH=/opt/optimum-nvidia/src:$PYTHONPATH
RUN python -m pip install /opt/optimum-nvidia

#Add the project sources to the final layer
COPY . /opt/optimum-nvidia
# Let's put our users in the examples folder
WORKDIR /opt/optimum-nvidia/examples
16 changes: 14 additions & 2 deletions docker/Dockerfile.dev
@@ -1,4 +1,16 @@
FROM tensorrt_llm:latest
FROM tensorrt_llm/devel:latest

# 75 = T4/RTX Quadro
# 80 = A100/A30
# 86 = A10/A40/RTX Axxx
# 89 = L4/L40/L40s/RTX Ada/4090
# 90 = H100/H200
#ARG TARGET_CUDA_ARCHS="75-real;80-real;86-real;89-real;90-real"

COPY . /opt/optimum-nvidia

# Install dependencies
RUN python -m pip install --no-cache-dir --upgrade accelerate datasets huggingface_hub hf-transfer optimum transformers pynvml
RUN python -m pip install /opt/optimum-nvidia

# Let's put our users in the examples folder
WORKDIR /opt/optimum-nvidia/examples
54 changes: 54 additions & 0 deletions pyproject.toml
@@ -1,3 +1,57 @@
[project]
name = "optimum-nvidia"
requires-python = ">= 3.10"
dynamic = ["version"]
description = """Optimum Nvidia is the interface between the Hugging Face Transformers and NVIDIA GPUs. "
"It provides a set of tools enabling easy model loading, training and "
"inference on single and multiple GPU cards for different downstream tasks."""

# Some contact information
authors = [{name = "HuggingFace Inc. Machine Learning Optimization Team", email = "[email protected]"}]
keywords = ["transformers", "neural-network", "inference", "nvidia", "tensorrt", "ampere", "hopper"]
readme = "README.md"
license = {file = "LICENSE"}
classifiers = [
"Development Status :: 2 - Pre-Alpha",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]

[project.urls]
Homepage = "https://huggingface.co/hardware/nvidia"
Repository = "https://github.com/huggingface/optimum-nvidia"
Issues = "https://github.com/huggingface/optimum-nvidia/issues"

# List dependencies
dependencies = [
"accelerate",
"dataset",
"huggingface_hub > 0.17.0, < 0.21.0",
"hf-transfer",
"numpy >= 1.22.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"setuptools",
"transformers >= 4.38.1",
# "tensorrt_llm",
"pynvml"
]

# List additional dependencies
[project.optional-dependencies]
test = ["pytest", "psutil", "parameterized", "datasets", "safetensors",]
quality = ["black", "ruff", "isort", "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",]

# Configure build system
[build-system]
requires = ["setuptools"]

# Configure tools around
[tool.ruff]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "F402", "F823" ]
15 changes: 13 additions & 2 deletions setup.cfg
@@ -12,6 +12,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[options]
package_dir =
=src
packages = find_namespace:

[options.packages.find]
where = src

[isort]
default_section = FIRSTPARTY
@@ -25,8 +32,12 @@ multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
ignore = E203, E501, E741, W503, W605, F401
max-line-length = 119

[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS

[tool:ruff:lint]
ignore-init-module-imports = true
22 changes: 8 additions & 14 deletions setup.py
@@ -14,9 +14,9 @@
# limitations under the License.
import re
from distutils.core import setup

from platform import system, machine, python_version
from setuptools import find_namespace_packages

from sys import version_info as pyversion

# Ensure we match the version set in optimum/nvidia/version.py
filepath = "src/optimum/nvidia/version.py"
@@ -29,13 +29,15 @@
INSTALL_REQUIRES = [
"accelerate",
"dataset",
"fsspec",
"huggingface_hub >= 0.14.0",
"huggingface_hub == 0.20.0",
"hf-transfer",
"numpy >= 1.22.0",
"numpy >= 1.26.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"setuptools",
"transformers >= 4.38.1",
# "tensorrt-llm == 0.9.0dev2024022000",
# "nvidia-ammo >= 0.7.0",
"pynvml"
]

@@ -55,17 +57,9 @@
]


QUANTIZATION_REQUIRES = [
"ammo" # This one is a bit harder to install ...
"datasets"
"transformers",
"torch",
]

EXTRAS_REQUIRE = {
"tests": TESTS_REQUIRES,
"quality": QUALITY_REQUIRES,
"quantization": QUANTIZATION_REQUIRES,
}

setup(
@@ -85,7 +79,6 @@
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
@@ -97,6 +90,7 @@
packages=find_namespace_packages(include=["optimum*"]),
install_requires=INSTALL_REQUIRES,
extras_require=EXTRAS_REQUIRE,
dependency_links=["https://pypi.nvidia.com"],
include_package_data=True,
zip_safe=False,
)
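The collapsed portion of setup.py above reads the release number out of src/optimum/nvidia/version.py (see the "Ensure we match the version" comment near the top of the file). A hypothetical sketch of that lookup; the names and the regular expression are illustrative only, since the actual lines are hidden by the truncated diff:

import re

# Hypothetical sketch -- the real lookup in setup.py is collapsed in this diff.
filepath = "src/optimum/nvidia/version.py"
with open(filepath) as f:
    match = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', f.read())

if match is None:
    raise RuntimeError(f"Unable to find __version__ in {filepath}")
__version__ = match.group(1)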
20 changes: 10 additions & 10 deletions src/optimum/nvidia/models/gemma.py
@@ -64,8 +64,8 @@ def load_from_hf_gemma(

model_params = dict(hf_gemma.named_parameters())
# concatenate, duplicate and reshape q, k, v -> qkv
for l in range(hf_gemma.config.num_hidden_layers):
prefix = f"model.layers.{l}.self_attn."
for layer in range(hf_gemma.config.num_hidden_layers):
prefix = f"model.layers.{layer}.self_attn."
q_weight = model_params[prefix + "q_proj.weight"]
k_weight = model_params[prefix + "k_proj.weight"]
v_weight = model_params[prefix + "v_proj.weight"]
@@ -90,31 +90,31 @@ tensorrt_llm_llama.config.moe_normalization_mode,
tensorrt_llm_llama.config.moe_normalization_mode,
)
# concatenate MoE gated activations & stack experts
for l in range(hf_gemma.config.num_hidden_layers):
for layer in range(hf_gemma.config.num_hidden_layers):
if not moe_config.has_moe():
continue

rank_experts = list(range(moe_config.num_experts))
if moe_config.tp_mode == moe_config.ParallelismMode.EXPERT_PARALLEL:
rank_experts = mapping.ep_experts(moe_config.num_experts)
for suffix in ["w1", "w2", "w3"]:
model_params[f"model.layers.{l}.block_sparse_moe.experts.{suffix}.weight"] = torch.stack(
model_params[f"model.layers.{layer}.block_sparse_moe.experts.{suffix}.weight"] = torch.stack(
[
model_params[f"model.layers.{l}.block_sparse_moe.experts.{expert}.{suffix}.weight"]
model_params[f"model.layers.{layer}.block_sparse_moe.experts.{expert}.{suffix}.weight"]
for expert in rank_experts
]
)

w3 = model_params[f"model.layers.{l}.block_sparse_moe.experts.w3.weight"]
w2 = model_params[f"model.layers.{l}.block_sparse_moe.experts.w2.weight"]
w1 = model_params[f"model.layers.{l}.block_sparse_moe.experts.w1.weight"]
w3 = model_params[f"model.layers.{layer}.block_sparse_moe.experts.w3.weight"]
w2 = model_params[f"model.layers.{layer}.block_sparse_moe.experts.w2.weight"]
w1 = model_params[f"model.layers.{layer}.block_sparse_moe.experts.w1.weight"]
if moe_config.tp_mode == moe_config.ParallelismMode.TENSOR_PARALLEL:
w3 = split(w3, mapping.tp_size, mapping.tp_rank, dim=1)
w2 = split(w2, mapping.tp_size, mapping.tp_rank, dim=2)
w1 = split(w1, mapping.tp_size, mapping.tp_rank, dim=1)
# concat w3 and w1 for gated expert
model_params[f"model.layers.{l}.block_sparse_moe.experts.w3w1.weight"] = torch.concat([w3, w1], dim=-2)
model_params[f"model.layers.{l}.block_sparse_moe.experts.w2.weight"] = w2
model_params[f"model.layers.{layer}.block_sparse_moe.experts.w3w1.weight"] = torch.concat([w3, w1], dim=-2)
model_params[f"model.layers.{layer}.block_sparse_moe.experts.w2.weight"] = w2

torch_dtype = str_dtype_to_torch(dtype)
layers_range = mapping.pp_layers(hf_gemma.config.num_hidden_layers)
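The "concatenate, duplicate and reshape q, k, v -> qkv" step mentioned at the top of this hunk is collapsed in the diff. A minimal, hypothetical sketch of such a fusion for one layer; the function name, shapes, and the plain concatenation are illustrative and not the actual load_from_hf_gemma implementation:

import torch

def fuse_qkv(q_weight: torch.Tensor, k_weight: torch.Tensor, v_weight: torch.Tensor) -> torch.Tensor:
    # Stack the per-layer projection weights into one fused qkv tensor.
    # q: (num_heads * head_dim, hidden); k/v: (num_kv_heads * head_dim, hidden).
    return torch.cat([q_weight, k_weight, v_weight], dim=0)

# e.g. qkv = fuse_qkv(q_weight, k_weight, v_weight) for each decoder layer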
3 changes: 0 additions & 3 deletions src/optimum/nvidia/runtime.py
@@ -25,9 +25,6 @@
from huggingface_hub import ModelHubMixin


# from optimum.nvidia.builder import TensorRTEngineBuilder


LOGGER = getLogger(__name__)

PackedTensor = List[torch.Tensor]
