Offline QuaRot Implementation #1556

Merged 2 commits on Jan 22, 2025
6 changes: 6 additions & 0 deletions .lintrunner.toml
@@ -66,6 +66,9 @@ code = 'BLACK-ISORT'
include_patterns = [
'**/*.py'
]
+exclude_patterns = [
+    '**/hadamard_utils.py'
+]
command = [
'python',
'-m',
@@ -91,6 +94,9 @@ code = 'PYLINT'
include_patterns = [
'**/*.py'
]
+exclude_patterns = [
+    '**/hadamard_utils.py'
+]
command = [
'python',
'-m',
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -13,6 +13,7 @@ repos:
hooks:
- id: black
name: Format code
exclude: "hadamard_utils.py"
- repo: https://github.com/pycqa/isort
rev: 5.11.5
hooks:
Expand Down
3 changes: 1 addition & 2 deletions docs/source/how-to/cli/cli-quantize.md
@@ -16,7 +16,6 @@ Some methods require a GPU and/or a calibration dataset.
| ------ | ------------ | ------------ | ------------------ | ------------------ | ------------------- |
| AWQ | Activation-aware Weight Quantization (AWQ) creates 4-bit quantized models, speeding up models by 3x and reducing memory requirements by 3x compared to FP16. | ✔️ || PyTorch <br> HuggingFace | PyTorch |
| GPTQ | Generative Pre-trained Transformer Quantization (GPTQ) is a one-shot weight quantization method. You can quantize your favorite language model to 8, 4, 3 or even 2 bits. | ✔️ | ✔️ | PyTorch <br> HuggingFace | PyTorch |
-| QuaRot | Quantization technique that combines quantization and rotation to reduce the number of bits required to represent the weights of a model. | ✔️ | ✔️ | HuggingFace | PyTorch |
| bnb4 | A MatMul with weights quantized to N bits (e.g., 2, 3, 4, 5, 6, 7). ||| ONNX | ONNX |
| ONNX Dynamic | Dynamic quantization calculates the quantization parameters (scale and zero point) for activations dynamically. ||| ONNX | ONNX |
| INC Dynamic | Intel® Neural Compressor model compression tool. ||| ONNX | ONNX |
@@ -43,7 +42,7 @@ olive quantize \

## Quantization with ONNX Optimizations

-As articulated in [Supported quantization techniques](#supported-quantization-techniques), you may wish to take the PyTorch/Hugging Face output of AWQ/GPTQ/QuaRot quantization methods and convert into an optimized ONNX format so that you can inference using the ONNX runtime.
+As articulated in [Supported quantization techniques](#supported-quantization-techniques), you may wish to take the PyTorch/Hugging Face output of the AWQ/GPTQ quantization methods and convert it into an optimized ONNX format so that you can run inference with ONNX Runtime.

You can use Olive's automatic optimizer (`auto-opt`) to create an optimized ONNX model from a quantized model:
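For instance, reusing the `auto-opt` flags that appear in the Phi-3 example later in this diff (the model path and precision below are placeholders, not a prescribed invocation):

```bash
olive auto-opt -m path/to/quantized-model --precision int4
```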

Changes to the quantization pass documentation (file name not captured)
@@ -38,19 +38,14 @@ Please refer to [AutoAWQQuantizer](awq_quantizer) for more details about the pass
```

## QuaRot
-`QuaRot` is a quantization technique that combines quantization and rotation to reduce the number of bits required to represent the weights of a model. It is based on the [QuaRot paper](https://arxiv.org/abs/2305.14314).
+`QuaRot` is a technique that rotates the weights of a model to make them more conducive to quantization. It is based on the [QuaRot paper](https://arxiv.org/abs/2404.00456) but only performs offline weight rotation. It can be followed by a pass such as GPTQ to quantize the rotated model weights.

This pass only supports HuggingFace transformer PyTorch models. Please refer to [QuaRot](quarot) for more details on the types of transformer models supported.
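The rotation trick can be sketched in a few lines of PyTorch. The snippet below is illustrative only (the helper names are not Olive's internal API, and the full method also handles biases, embeddings, and attention/normalization fusion), but it shows why folding an orthogonal Hadamard matrix into adjacent linear layers leaves the computed function unchanged:

```python
import torch


def hadamard(n: int) -> torch.Tensor:
    """Sylvester construction of an orthonormal Hadamard matrix (n a power of two)."""
    h = torch.ones(1, 1)
    while h.shape[0] < n:
        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
    return h / n**0.5  # normalized so that h @ h.T == I


@torch.no_grad()
def rotate_pair(producer: torch.nn.Linear, consumer: torch.nn.Linear) -> None:
    """Fuse an orthogonal Q into two adjacent (bias-free) linears.

    The producer's output is rotated by Q and the consumer absorbs Q^T on its
    input side; since Q @ Q.T == I, the composition computes the same function,
    but the rotated weights tend to have fewer outliers and quantize better.
    """
    q = hadamard(producer.out_features).to(producer.weight.dtype)
    producer.weight.copy_(q.T @ producer.weight)  # output y becomes y @ Q
    consumer.weight.copy_(consumer.weight @ q)    # consumer undoes the rotation


# Self-check on a toy pair of layers: outputs are unchanged after rotation.
torch.manual_seed(0)
f1, f2 = torch.nn.Linear(8, 16, bias=False), torch.nn.Linear(16, 8, bias=False)
x = torch.randn(2, 8)
before = f2(f1(x))
rotate_pair(f1, f2)
assert torch.allclose(before, f2(f1(x)), atol=1e-5)
```

Because the rotation is fused into the existing weights offline, inference runs no extra matmuls; a downstream quantization pass simply sees better-conditioned weights.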

### Example Configuration
```json
{
"type": "QuaRot",
"w_rtn": true,
"rotate": true,
"w_bits": 4,
"a_bits": 4,
"k_bits": 4,
"v_bits": 4
"rotate_mode": "hadamard"
}
```
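Since this pass only rotates the weights, the actual quantization happens in a follow-up pass. Below is a sketch of a workflow `passes` section that chains QuaRot with the `GptqQuantizer` pass; the entry names are illustrative, and the `GptqQuantizer` options mirror the CLI defaults in `olive/cli/quantize.py` further down:

```json
{
    "passes": {
        "rotate": { "type": "QuaRot", "rotate_mode": "hadamard" },
        "quantize": { "type": "GptqQuantizer", "bits": 4, "data_config": "default_data_config" }
    }
}
```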
13 changes: 1 addition & 12 deletions examples/phi3/README.md
@@ -41,7 +41,7 @@ You can use Olive CLI command to export, fine-tune, and optimize the model for a
olive auto-opt -m microsoft/Phi-3-mini-4k-instruct --precision int8
# To quantize the model
-olive quantize -m microsoft/Phi-3-mini-4k-instruct --trust_remote_code --precision fp16 --implementation quarot
+olive quantize -m microsoft/Phi-3-mini-4k-instruct --implementation gptq
# To tune ONNX session params
olive tune-session-params -m microsoft/Phi-3-mini-4k-instruct --io_bind --enable_cuda_graph
@@ -94,17 +94,6 @@ olive run [--config CONFIGURATION_FILE]
olive run --config phi3_run_mobile_int4.json
```

-We also introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end.
-Specific details about the algorithm can be found in the linked [paper](https://arxiv.org/pdf/2404.00456).
-
-## Prerequisites
-[QuaRot](https://github.com/microsoft/TransformerCompression/tree/quarot-main)
-
-To run the workflow,
-```bash
-python phi3.py --quarot
-```

### Get access to fine-tuning dataset
Get access to the following resources on Hugging Face Hub:
- [nampdn-ai/tiny-codes](https://huggingface.co/nampdn-ai/tiny-codes)
19 changes: 0 additions & 19 deletions examples/phi3/phi3.py
@@ -70,11 +70,6 @@ def get_args(raw_args):
)

quant_group = parser.add_mutually_exclusive_group()
-    quant_group.add_argument(
-        "--quarot",
-        action="store_true",
-        help="Run QuaRot on a Hugging Face PyTorch model",
-    )
quant_group.add_argument(
"--awq",
action="store_true",
@@ -159,9 +154,6 @@ def main(raw_args=None):

olive_run(run_config)

-    if args.quarot:
-        return

if args.inference:
if not args.chat_template:
args.chat_template = (
@@ -211,17 +203,6 @@ def generate_config(args):

config_prefix = "phi3_run_"

-    if args.quarot:
-        template_json = use_passes(template_json, "quarot")
-        template_json["systems"]["local_system"]["accelerators"] = [
-            {"device": "GPU", "execution_providers": ["CUDAExecutionProvider"]}
-        ]
-        new_json_file = f"{config_prefix}quarot.json"
-        with open(new_json_file, "w") as f:
-            json.dump(template_json, f, indent=4)
-
-        return new_json_file

# use aml instance of model
if args.source == "AzureML":
template_json["input_model"]["model_path"] = AML_MODEL_Path
10 changes: 0 additions & 10 deletions examples/phi3/phi3_template.json
@@ -103,16 +103,6 @@
"execution_providers_list": [ "CUDAExecutionProvider" ],
"opt_level_list": [ 0, 1 ],
"execution_mode_list": [ 0, 1 ]
-        },
-        "quarot": {
-            "type": "QuaRot",
-            "w_rtn": true,
-            "rotate": true,
-            "w_bits": 4,
-            "a_bits": 4,
-            "k_bits": 4,
-            "v_bits": 4,
-            "calibration_data_config": "wikitext2_train"
}
},
"pass_flows": [ [ "<place_holder>" ] ],
40 changes: 4 additions & 36 deletions olive/cli/quantize.py
@@ -69,9 +69,6 @@ def register_subcommand(parser: ArgumentParser):
action="store_true",
help="Use QDQ encoding in ONNX model for the quantized nodes.",
)
-        sub_parser.add_argument(
-            "--quarot_rotate", action="store_true", help="Apply QuaRot/Hadamard rotation to the model."
-        )

add_dataset_options(sub_parser, required=False, include_train=False, include_eval=False)
add_remote_options(sub_parser)
@@ -96,9 +93,7 @@ def _get_run_config(self, tempdir: str) -> Dict[str, Any]:
if not self.args.precision:
self.args.precision = ALGORITHMS[self.args.algorithm][defaults_key]["precision"]

-        if self.args.algorithm in ["gptq", "rtn"] and self.args.implementation == "quarot":
-            self.args.precision = "int16"
-        elif self.args.algorithm == "rtn" and self.args.precision == "nf4":
+        if self.args.algorithm == "rtn" and self.args.precision == "nf4":
self.args.implementation = "bnb4"

if self.args.enable_qdq_encoding and self.args.implementation != "matmul4":
@@ -130,10 +125,6 @@ def _get_run_config(self, tempdir: str) -> Dict[str, Any]:
(("passes", "awq", "w_bit"), precision),
(("passes", "gptq", "bits"), precision),
(("passes", "bnb4", "quant_type"), precision),
(("passes", "quarot", "w_bits"), precision),
(("passes", "quarot", "rotate"), self.args.quarot_rotate),
(("passes", "quarot", "w_rtn"), self.args.algorithm == "rtn"),
(("passes", "quarot", "w_gptq"), self.args.algorithm == "gptq"),
(("passes", "nvmo", "precision"), precision),
(("passes", "nvmo", "algorithm"), self.args.algorithm.upper()),
(("passes", "onnx_dynamic", "weight_type"), precision),
@@ -154,9 +145,6 @@ def run(self):
if ("gptq" in self.args.algorithm) and (not self.args.data_name):
raise ValueError("data_name is required to use gptq.")

if ("quarot" in self.args.algorithm) and (not self.args.data_name) and (self.args.quarot_strategy == "gptq"):
raise ValueError("data_name is required to quantize weights using gptq.")

with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir:
run_config = self._get_run_config(tempdir)
olive_run(run_config)
@@ -184,14 +172,6 @@ def run(self):
# Pytorch algorithms
"awq": {"type": "AutoAWQQuantizer", "w_bit": 4},
"gptq": {"type": "GptqQuantizer", "bits": 4, "data_config": "default_data_config"},
"quarot": {
"type": "QuaRot",
"w_bits": 16,
"w_rtn": False,
"w_gptq": False,
"rotate": False,
"calibration_data_config": "default_data_config",
},
# Onnx algorithms
"bnb4": {"type": "OnnxBnb4Quantization", "quant_type": "nf4"},
"matmul4": {"type": "OnnxMatMul4Quantizer", "accuracy_level": 4},
@@ -219,14 +199,14 @@ def run(self):
"description": "(HfModel, OnnxModel) WOQ with AWQ.",
},
"gptq": {
"implementations": ["gptq", "quarot", "matmul4", "inc_static", "inc_dynamic"],
"implementations": ["gptq", "matmul4", "inc_static", "inc_dynamic"],
"hf_model_defaults": {"implementation": "gptq", "precision": "int4"},
"onnx_model_defaults": {"implementation": "matmul4", "precision": "int4"},
"description": "(HfModel, OnnxModel) WOQ with GPTQ.",
},
"rtn": {
"implementations": ["quarot", "bnb4", "matmul4"],
"hf_model_defaults": {"implementation": "quarot", "precision": "int16"},
"implementations": ["bnb4", "matmul4"],
"hf_model_defaults": {"implementation": None, "precision": None},
"onnx_model_defaults": {"implementation": "onnx_static", "precision": "int8"},
"description": "(HfModel, OnnxModel) WOQ with RTN.",
},
@@ -275,18 +255,6 @@
"uint16": 16,
},
},
"quarot": {
"name": "QuaRot/Hadamard rotation",
"supported_precisions": [],
"precision_mapping": {
"int4": 4,
"int8": 8,
"int16": 16,
"uint4": 4,
"uint8": 8,
"uint16": 16,
},
},
"bnb4": {
"name": "Bits-n-Bytes",
"supported_precisions": ["fp4", "nf4"],
10 changes: 10 additions & 0 deletions olive/common/utils.py
@@ -3,6 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import codecs
+import gc
import hashlib
import inspect
import io
@@ -659,3 +660,12 @@ def load_weights(path: Union[str, Path], file_format: Optional[WeightsFileFormat
def unescaped_str(arg_str):
    """Decode strings without escaping."""
    return codecs.decode(arg_str, "unicode_escape")
+
+
+def cleanup_memory():
+    """Cleanup memory by running garbage collection and emptying CUDA cache."""
+    import torch
+
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
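A hypothetical usage sketch (not part of this diff): drop the last reference to a large object, then call the helper so that freed CUDA blocks are actually released:

```python
import torch

from olive.common.utils import cleanup_memory

# Allocate something large, drop the reference, then reclaim the memory.
buf = torch.empty(4096, 4096, device="cuda" if torch.cuda.is_available() else "cpu")
del buf
cleanup_memory()  # gc.collect() + torch.cuda.empty_cache() when CUDA is available
```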
13 changes: 6 additions & 7 deletions olive/olive_config.json
@@ -285,6 +285,12 @@
"supported_precisions": [ "*" ],
"module_dependencies": [ "pytorch-lightning" ]
},
"QuaRot": {
"module_path": "olive.passes.pytorch.rotate.QuaRot",
"supported_providers": [ "*" ],
"supported_accelerators": [ "*" ],
"supported_precisions": [ "*" ]
},
"SparseGPT": {
"module_path": "olive.passes.pytorch.sparsegpt.SparseGPT",
"supported_providers": [ "*" ],
@@ -297,13 +303,6 @@
"supported_accelerators": [ "*" ],
"supported_precisions": [ "*" ]
},
"QuaRot": {
"module_path": "olive.passes.pytorch.quarot.QuaRot",
"supported_providers": [ "CPUExecutionProvider" ],
"supported_accelerators": [ "cpu" ],
"supported_precisions": [ "int4", "int8", "int16", "uint4", "uint8", "uint16" ],
"extra_dependencies": [ "flash-attn" ]
},
"TorchTRTConversion": {
"module_path": "olive.passes.pytorch.torch_trt_conversion.TorchTRTConversion",
"supported_providers": [ "*" ],