feat(tests): Update CI to use new workflow and silicon. (#145)
* Update CI to use new workflow and silicon.

* remove third-party trtllm

* Pin TensorRT-LLM version

* Pin TensorRT-LLM to first 0.12 beta version

* Ensure torch dependency constraints match TRTLLM

* Remove quantization for now

* Allow restoring from local dir

* build all ranks

* Improve checkpoints and engines discovery

* use DirEntry.name in the lambda to retrieve filepath

* quality

* Update checkpoint detection regex

* add logic to reload from a local directory when model_id points to a local transformers save_pretrained path (see the sketch after this list)

* quality

* make sure to create the sharding config

* move to latest trtllm version

* remove bad_words for now

* in fact it's now called bad_token_ids

* move on to GenerationExecutor, removing tokenizer param

* specify constructor call order for models

* specify local workspace for converting local model

* fix wrong way of symlinking files and folders in save_pretrained

* detect model_type from engine config if applicable

* remove deprecated test_config.py

* handled dtype from prebuilt engine

* handled model_type from different config format

* added test for model_type_from_known_config

* quality

* fix wrong ref name in test_hub

* More robust way to load artifacts

* fix save_pretrained

* fix missing branch for raise in model type inference

* quality
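
The local-directory reload mentioned in the list above can be pictured with a minimal, hypothetical sketch. The helper names below (`is_local_transformers_checkpoint`, `resolve_model_source`) are illustrative only and do not exist in optimum-nvidia; the one assumption is that a transformers `save_pretrained()` directory contains a `config.json`.

```python
# Hypothetical sketch of the "reload from local directory" check; not the
# actual optimum-nvidia implementation.
from pathlib import Path


def is_local_transformers_checkpoint(model_id: str) -> bool:
    """Return True if model_id looks like a local save_pretrained() output."""
    path = Path(model_id)
    # save_pretrained() always writes a config.json next to the weights.
    return path.is_dir() and (path / "config.json").is_file()


def resolve_model_source(model_id: str) -> str:
    """Decide whether to restore from disk or fetch from the Hugging Face Hub."""
    if is_local_transformers_checkpoint(model_id):
        return f"local checkpoint: {Path(model_id).resolve()}"
    return f"hub repository: {model_id}"


if __name__ == "__main__":
    print(resolve_model_source("./my-saved-model"))
    print(resolve_model_source("meta-llama/Llama-2-7b-hf"))
```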
mfuntowicz authored Jul 24, 2024
1 parent 011b5a9 commit fa8556c
Showing 17 changed files with 314 additions and 275 deletions.
69 changes: 0 additions & 69 deletions .github/workflows/pr_functional_tests.yml

This file was deleted.

@@ -14,9 +14,6 @@ concurrency:

env:
  OPTIMUM_NVIDIA_IS_CI: ON
-  RUN_CPU_ONLY: OFF
-  RUN_NIGHTLY: OFF
-  RUN_SLOW: ON
  HF_HUB_ENABLE_HF_TRANSFER: ON

jobs:
@@ -25,12 +22,13 @@ jobs:
      fail-fast: false
      matrix:
        config:
-          - name: GPU-enabled Optimum-Nvidia Integration Test Suite
-            image: nvidia/cuda:12.1.0-devel-ubuntu22.04
-        cuda_arch: [sm_86, sm_89]
+          - name: GPU-enabled Optimum-Nvidia Test Suite
+            image: nvidia/cuda:12.4.1-devel-ubuntu22.04
+        gpu_target: ["nvidia-multi-gpu-l4-runners", "nvidia-multi-gpu-a10-runners"]

    name: ${{ matrix.config.name }}
-    runs-on: [ci, nvidia-gpu, multi-gpu, "${{ matrix.cuda_arch }}"]
+    runs-on:
+      group: "${{matrix.gpu_target}}"

    container:
      image: ${{ matrix.config.image }}
@@ -65,6 +63,6 @@ jobs:
        run: |
          python -c "from tensorrt_llm import __version__; print(__version__)"

-      - name: Run optimum-nvidia integration test-suite
+      - name: Run optimum-nvidia test-suite
        run: |
-          pytest -s -vvvvv -n 1 -p no:warnings -o log_cli=true tests/integration/
+          pytest -s -vvvvv -n 4 -p no:warnings -o log_cli=true --ignore=tests/integration tests/
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,3 +0,0 @@
-[submodule "third-party/tensorrt-llm"]
-  path = third-party/tensorrt-llm
-  url = https://github.com/nvidia/tensorrt-llm
4 changes: 2 additions & 2 deletions README.md
@@ -7,8 +7,8 @@ Optimum-NVIDIA

[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://huggingface.co/docs/optimum/index)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31013/)
-[![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.9.0-green)](https://github.com/nvidia/tensorrt-llm)
+[![cuda](https://img.shields.io/badge/cuda-12.4-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.11.0.dev2024070200-green)](https://github.com/nvidia/tensorrt-llm)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

---
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -33,8 +33,8 @@ dependencies = [
    "onnx >= 1.12.0",
    "optimum >= 1.13.0",
    "setuptools",
-    "tensorrt-llm > 0.10.0",
-    "torch>=2.2.0a,<=2.3.0a",
+    "tensorrt-llm == 0.12.0.dev2024072300",
+    "torch>=2.3.0a,<=2.4.0a",
    "transformers >= 4.38.2",
    "pynvml"
]
4 changes: 2 additions & 2 deletions setup.py
@@ -36,8 +36,8 @@
    "onnx >= 1.12.0",
    "optimum >= 1.13.0",
    "setuptools",
-    "tensorrt-llm > 0.10.0",
-    "torch>=2.2.0a,<=2.3.0a",
+    "tensorrt-llm == 0.12.0.dev2024072300",
+    "torch>=2.3.0a,<=2.4.0a",
    "transformers >= 4.38.2",
    "pynvml"
]
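
Since both `pyproject.toml` and `setup.py` now pin `tensorrt-llm` to the first 0.12 pre-release and tighten the `torch` range to match it, a quick sanity check of an installed environment can mirror the CI step above. This is a minimal sketch; the expected version strings are copied from this diff and will drift as the pin moves.

```python
# Sketch: confirm an environment matches the pins from this commit.
# Expected values below are copied from the diff and may change later.
import torch
from tensorrt_llm import __version__ as trtllm_version  # same import the CI step uses

print("tensorrt-llm:", trtllm_version)  # pinned here to 0.12.0.dev2024072300
print("torch:", torch.__version__)      # constrained here to >=2.3.0a,<=2.4.0a
```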
9 changes: 7 additions & 2 deletions src/optimum/nvidia/export/config.py
@@ -59,12 +59,17 @@ def from_config(
        max_input_len = config.max_position_embeddings
        max_output_len = config.max_position_embeddings

-        return ExportConfig(
+        econfig = ExportConfig(
            dtype=dtype,
            max_input_len=max_input_len,
            max_output_len=max_output_len,
            max_batch_size=max_batch_size,
-        ).validate()
+        )
+
+        # Initialize sharing information with single shard
+        econfig.with_sharding()
+        econfig.validate()
+        return econfig

    def validate(self) -> "ExportConfig":
        if self.optimization_level < 0:
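
For context on the `ExportConfig` change above: `from_config` now builds the config, attaches a default single-shard sharding configuration via `with_sharding()`, and only then validates. A hedged usage sketch follows; the import path and the exact `from_config` signature are assumptions inferred from the file location and the diff, not verified against the library.

```python
# Sketch only: the import path and signature below are assumptions; check
# src/optimum/nvidia/export/config.py for the real API.
from transformers import AutoConfig

from optimum.nvidia.export import ExportConfig  # assumed re-export location

# "gpt2" is just a small placeholder checkpoint for illustration.
hf_config = AutoConfig.from_pretrained("gpt2")
export_config = ExportConfig.from_config(hf_config, max_batch_size=1)

# After this commit, from_config() calls with_sharding() itself, so a
# single-shard sharding config is guaranteed to exist before validate() runs.
print(export_config)
```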
