Bring float8 quantization back into the game. (#92)
* Bring float8 quantization back into the game (condensed usage sketch after this commit list, before the file diffs)

* Expose max_sequence_length parameter for the calibration dataset

* When doing quantization, let's not try to reconvert the model right after

* Use correct sharding / offloading for models not fitting on the local GPUs during quantization

* Expose the batch_size to use for calibration defaulting to 1

* Put the pre/post process in the right order

* Change the order in which export and calibration happen

* Let's make sure to clean up the HF model in all cases

* Add some float8 quantization tests

* Let's use c4-new dataset

* Fix wrong str identifier around variable name

* Always specify the lm_head quantization schema

* Let's use cartesian product of the parameters

* Quality

* Expose device parameter for target datasets

* quality

* Adapt the quality workflow to match the local setup

* Add missing pip install command

* Make sure to use the right python version for quality

* Use direct dependency rather than extras to avoid huge downloads

* Quality

* Add end-to-end float8 calibration flow

* Let's split functional / integration tests

* Rename workflow titles

* Quality

* Some more renaming in workflows

* Let's create a temporary folder for unittest

* Fix invalid hf model creation from auto factory with config

* Fix more issues with invalid layer size

* Once again ...

* Reintroduce use_fp8

* Quality

* Let's make a smaller model and ensure the config values stay in integer repr

* Let's save the tokenizer along with the model for the tests

* Change wording

* Update dependency huggingface_hub with the right naming

* One last dependency update

* Let's make sure we can serialize the qconfig

* Do not serialize the calibration datasets

* Force the config to be forwarded

* let's uninstall optimum-nvidia from the container

* Again

* Increase shared memory for workflows

* Limit concurrency for integration tests

* Increase verbosity for now to debug

* Update CI image

* Update tests concurrency

* Add some more logging to dig

* Fix tqdm import

* Reduce workload for quantization tests

* Update huggingface-hub with the config fix

* Once more

* Remove debugging print statement

* Quality

* Let's just remove all layers and use a single one

* Attempt to give more info in case of failure

* Let's use a few more samples to quantize

* Add a utility to skip tests if the required SM is not met

* Increase shm and use tmpfs for tests

* Once more

* Pin huggingface-hub main version

* Quality

* Preinstall dependencies for optimum-nvidia in the dev container

* Let's relax testing for loading from the Hub if no revision is found for the underlying hardware

* quality

* Disable gemma-2b testing for now

* Let's follow ModelHubMixin guidelines

* Retrieve the device name in parts

* Let's raise an issue if the config is None

* Quality
mfuntowicz authored Mar 20, 2024
1 parent 97446a8 commit 22a3a3a
Showing 31 changed files with 1,231 additions and 413 deletions.
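Condensed from the new examples/quantization.py added by this commit (full listing in the diff below), the restored float8 flow looks roughly as follows. This is a sketch only: the CLI plumbing and batch/prompt-length limits are omitted, and the model id and output path are placeholders.

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM
from optimum.nvidia.quantization import AutoQuantizationConfig

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

# float8 weights and activations, calibrated on the c4-new dataset
qconfig = AutoQuantizationConfig.from_description(
    weight="float8",
    activation="float8",
    tokenizer=tokenizer,
    dataset="c4-new",
    max_sequence_length=1024,  # exposed by this PR for the calibration dataset
    num_samples=1024,
)

# Build the TensorRT engine with the quantization config and save it
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=qconfig)
model.save_pretrained("./engine-fp8")  # placeholder output directory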
@@ -1,4 +1,4 @@
- name: Test Suite on PRs
+ name: GPU-Enabled Integration Test on PRs

on:
pull_request:
@@ -22,19 +22,18 @@ jobs:
fail-fast: false
matrix:
config:
- - name: Fast Optimum-Nvidia Test Suite
+ - name: GPU-Enabled Optimum-Nvidia Functional Test Suite
runner: [ci, nvidia-gpu]
- image: huggingface/optimum-nvidia:devel
+ image: huggingface/optimum-nvidia:ci
report: dev
- # cuda_arch: [sm_86, sm_89]
- cuda_arch: [sm_89]
+ cuda_arch: [sm_86, sm_89]

name: ${{ matrix.config.name }}
runs-on: [ci, nvidia-gpu, multi-gpu, "${{ matrix.cuda_arch }}"]

container:
image: ${{ matrix.config.image }}
- options: --shm-size "16gb" --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
env:
HF_TOKEN: ${{ secrets.OPTIMUM_NVIDIA_HUB_READ_TOKEN }}

@@ -48,10 +47,14 @@ jobs:
with:
fetch-depth: 1

+ - name: Uninstall optimum-nvidia before doing anything
+ run: |
+ python -m pip uninstall -y optimum-nvidia
- name: Install dependencies
run: |
python -m pip install --upgrade -e .[quality,tests]
- - name: Run fast optimum-nvidia CPU tests
+ - name: Run optimum-nvidia functional test-suite
run: |
- python -m pytest -n 4 -s -v -p no:warnings tests
+ python -m pytest -n 4 -s -v -p no:warnings --ignore=tests/integration/ tests/
@@ -1,4 +1,4 @@
- name: GPU Enabled Test Suite on PRs
+ name: GPU Enabled Functional Test on PRs

on:
pull_request:
@@ -25,17 +25,16 @@ jobs:
fail-fast: false
matrix:
config:
- - name: GPU-enabled Optimum-Nvidia Test Suite
- image: huggingface/optimum-nvidia:devel
- # cuda_arch: [sm_86, sm_89]
- cuda_arch: [sm_89]
+ - name: GPU-enabled Optimum-Nvidia Integration Test Suite
+ image: huggingface/optimum-nvidia:ci
+ cuda_arch: [sm_86, sm_89]

name: ${{ matrix.config.name }}
runs-on: [ci, nvidia-gpu, multi-gpu, "${{ matrix.cuda_arch }}"]

container:
image: ${{ matrix.config.image }}
- options: --shm-size "16gb" --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
env:
HF_TOKEN: ${{ secrets.OPTIMUM_NVIDIA_HUB_READ_TOKEN }}

@@ -49,10 +48,14 @@ jobs:
with:
fetch-depth: 1

+ - name: Uninstall optimum-nvidia before doing anything
+ run: |
+ python -m pip uninstall -y optimum-nvidia
- name: Install dependencies
run: |
python -m pip install --upgrade -e .[quality,tests]
- - name: Run fast optimum-nvidia GPU tests
+ - name: Run optimum-nvidia integration test-suite
run: |
- python -m pytest -s -v -p no:warnings tests
+ python -m pytest -s -v -n 1 -p no:warnings tests/integration/
6 changes: 2 additions & 4 deletions .github/workflows/pr_quality.yml
@@ -20,12 +20,10 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install black ruff isort
- name: Check quality
run: |
- ruff check examples tests src scripts
- ruff format examples tests src scripts --check
+ make quality
9 changes: 4 additions & 5 deletions docker/Dockerfile.dev
@@ -8,9 +8,8 @@ FROM tensorrt_llm/devel:latest
#ARG TARGET_CUDA_ARCHS="75-real;80-real;86-real;89-real;90-real"

COPY . /opt/optimum-nvidia
+ WORKDIR /opt/optimum-nvidia

- # Install dependencies
- RUN python -m pip install /opt/optimum-nvidia
-
- # Let's put our users in the examples folder
- WORKDIR /opt/optimum-nvidia/examples
+ RUN pip install -e '.[quality, tests]' && \
+     pip uninstall -y optimum-nvidia && \
+     rm -rf /opt/optimum-nvidia
Empty file added docs/source/quantization.md
Empty file.
102 changes: 102 additions & 0 deletions examples/quantization.py
@@ -0,0 +1,102 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from argparse import ArgumentParser
from logging import getLogger
from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM, setup_logging
from optimum.nvidia.quantization import AutoQuantizationConfig


# Setup logging needs to happen before importing TRT ...
setup_logging(True)

from optimum.nvidia.utils.cli import (
    postprocess_quantization_parameters,
    register_common_model_topology_args,
    register_optimization_profiles_args,
    register_quantization_args,
)


LOGGER = getLogger(__name__)


if __name__ == "__main__":
    parser = ArgumentParser("🤗 Optimum-Nvidia Custom Quantization Example")
    parser.add_argument(
        "--hub-token",
        type=str,
        help="Hugging Face Hub Token to retrieve private weights.",
    )
    register_common_model_topology_args(parser)
    register_optimization_profiles_args(parser)
    register_quantization_args(parser)  # Inject params.quantization_config

    parser.add_argument("model", type=str, help="The model's id or path to use.")
    parser.add_argument(
        "output", type=Path, help="Path to store generated TensorRT engine."
    )
    args = parser.parse_args()
    args = postprocess_quantization_parameters(args)

    if args.hub_token is not None:
        from huggingface_hub import login

        login(args.hub_token)

    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # Quantization Config
    qconfig = AutoQuantizationConfig.from_description(
        weight="float8",
        activation="float8",
        tokenizer=tokenizer,
        dataset="c4-new",
        max_sequence_length=args.max_prompt_length,
        num_samples=1024,
    )

    # Create the model
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        max_batch_size=args.max_batch_size,
        max_prompt_length=args.max_prompt_length,
        num_beams=args.max_beam_width,
        quantization_config=qconfig,
    )
    model.save_pretrained(args.output)

    prompt = "What is the latest generation of Nvidia GPUs?"
    tokens = tokenizer(prompt, padding=True, return_tensors="pt")
    generated, lengths = model.generate(
        **tokens,
        top_k=40,
        top_p=0.95,
        repetition_penalty=10,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=256,
    )

    generated_text = tokenizer.batch_decode(
        generated.flatten(0, 1), skip_special_tokens=True
    )
    print(generated_text)
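If the saved engine directory follows the ModelHubMixin guidelines mentioned in the commit list, reloading it afterwards should be symmetric. Below is a minimal, hypothetical reload sketch, assuming from_pretrained accepts the local directory written by save_pretrained above and reusing the generate signature from the example; the model id, path, and prompt are placeholders.

from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder: the checkpoint quantized above
engine_dir = Path("./engine-fp8")  # placeholder: the `output` path passed to save_pretrained

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

# Assumption: a previously built engine directory can be loaded back directly
model = AutoModelForCausalLM.from_pretrained(engine_dir)

tokens = tokenizer("What is the latest generation of Nvidia GPUs?", padding=True, return_tensors="pt")
generated, lengths = model.generate(
    **tokens,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=64,
)
print(tokenizer.batch_decode(generated.flatten(0, 1), skip_special_tokens=True))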
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -25,10 +25,10 @@ classifiers = [
# List dependencies
dependencies = [
"accelerate",
"dataset",
"huggingface_hub > 0.17.0, < 0.21.0",
"datasets >= 2.14.0",
"huggingface-hub @ git+https://github.com/huggingface/huggingface_hub@45147c518ad3c1f70ecb462de4bf23cd553ba54b",
"hf-transfer",
"numpy >= 1.22.0",
"numpy >= 1.26.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"setuptools",
3 changes: 2 additions & 1 deletion setup.py
@@ -29,7 +29,7 @@
INSTALL_REQUIRES = [
"accelerate",
"datasets >= 2.14",
"huggingface_hub >= 0.21.0",
"huggingface-hub @ git+https://github.com/huggingface/huggingface_hub@45147c518ad3c1f70ecb462de4bf23cd553ba54b",
"hf-transfer",
"numpy >= 1.26.0",
"onnx >= 1.12.0",
@@ -38,6 +38,7 @@
"transformers >= 4.38.1",
# "tensorrt-llm == 0.9.0dev2024022000",
# "nvidia-ammo >= 0.7.0",
"torch < 2.2.0",
"pynvml"
]

1 change: 0 additions & 1 deletion src/optimum/nvidia/__init__.py
@@ -14,7 +14,6 @@
# limitations under the License.

from .config import TensorRTConfig
- from .lang import DataType
from .logging import DEFAULT_LOGGING_FMT, setup_logging
from .models import AutoModelForCausalLM
from .pipelines import pipeline