Adds SDXL support and CI testing, benchmarks. #271

Merged Apr 11, 2024 (182 commits)

Commits
a29fb24
Add precision to unet, vae and guidance scale as input to unet
monorimet Dec 18, 2023
a67f255
(WIP) Add SDXL
monorimet Jan 19, 2024
95a2f7f
WIP: Add CLIP, CLIP2 and tweaks to unet for SDXL
monorimet Jan 22, 2024
8fdc639
Move SDXL scripts.
monorimet Feb 7, 2024
97ee822
Fix formatting.
monorimet Feb 7, 2024
430ef6c
Fix formatting 2
monorimet Feb 9, 2024
010649c
f32/f16 -> fp32/fp16
monorimet Feb 9, 2024
55d8c42
Cherry-pick c404693 : Add a guarded sdpa_cpu torch decomposition and …
monorimet Feb 9, 2024
a38e0e9
Tweaks to SDXL script defaults, handles periods in hf_model_name via …
monorimet Feb 9, 2024
ebce84c
Change VAE export script default model.
monorimet Feb 9, 2024
ad1a5d5
Add a line in sd_test workflow to update torch version.
monorimet Feb 9, 2024
269ffe2
Makes sequence length configurable.
monorimet Feb 12, 2024
f297e60
Add max_length to safe names for unet, clip
monorimet Feb 12, 2024
2f8132d
Fix formatting
monorimet Feb 12, 2024
c703fa9
Add sdxl_test to CI
monorimet Feb 12, 2024
34b65da
Simplify VAE and remove SDPA decompositions
monorimet Feb 14, 2024
a0cf6dd
Fix formatting.
monorimet Feb 14, 2024
8c554fc
Fix some mismatches in VAE model comparisons.
monorimet Feb 14, 2024
26ad145
Small fixes to clip runner and remove sdpa decomp implem.
monorimet Feb 15, 2024
fd5328a
Make clip, vae filenames unique to precision, vae variant
monorimet Feb 21, 2024
5ed0c8a
Update SDXL tests, scripts with many small fixes to CLIP and others
monorimet Feb 22, 2024
f143cfc
Fix formatting.
monorimet Feb 22, 2024
33bee27
Make consteval flags exposed as arg in compile_to_vmfb
monorimet Feb 22, 2024
ce064b8
Exhaustively differentiate .mlir, vmfb files by config.
monorimet Feb 22, 2024
35a7f98
SDXL test
jinchen62 Feb 22, 2024
2d80b7e
Small filename fix and compile flag tweaks.
monorimet Feb 23, 2024
4dd1e51
Bump IREE version to >=20230306.822 for fx importer
monorimet Feb 23, 2024
0f7cd00
test argparse tweaks and pin mpmath.
monorimet Feb 23, 2024
1c769b5
SDXL test and benchmark (#474)
jinchen62 Feb 23, 2024
4ce0df7
Add flag to exporters, sdxl tests to decompose sdpfa at fx
monorimet Feb 25, 2024
2f63446
Change pytorch cpu requirement to latest (>=2.3.0)
monorimet Feb 26, 2024
b6de347
Fix --decomp_attn
monorimet Feb 26, 2024
3026e3d
Fix --decomp_attn for VAE as well.
monorimet Feb 26, 2024
49c712d
Change unet_runner timestep input to int64
monorimet Feb 27, 2024
b9aa8ac
Fix CLI for vae_runner.py
monorimet Feb 27, 2024
700ecb1
Use madebyollin/sdxl-vae-fp16-fix for weights in vae/vae_runner.py if…
monorimet Feb 27, 2024
6e66dc6
Add txt2img test.
monorimet Feb 29, 2024
44983d1
(WIP): Add e2e inference test for txtimg sdxl.
monorimet Feb 29, 2024
914b73f
Separate clip tester and encode_prompt fn
monorimet Feb 29, 2024
52a28da
Fix call to clip_runner in t2i test.
monorimet Feb 29, 2024
08f9544
Fix e2e t2i test for sdxl.
monorimet Feb 29, 2024
7edc7b1
Pass command line args for sdxl pytest (#487)
jinchen62 Feb 29, 2024
ef0d929
More t2i fixes (file mgmt)
monorimet Feb 29, 2024
edaeff7
Check for vmfbs, weights or skip t2i test, small fixes to torch runners
monorimet Mar 1, 2024
fc1d673
flag tweaks, and fixes to e2e inference
monorimet Mar 1, 2024
eecab90
Explicitly set some types in pytest args.
monorimet Mar 1, 2024
1bfed12
Support for SDXL schedulers + example (#499)
monorimet Mar 1, 2024
be93f74
Explicitly set target triple flag to string type.
monorimet Mar 1, 2024
a642cb3
Fix formatting.
monorimet Mar 1, 2024
112c6ed
WIP: shrink-wrap unet+scheduler txt2img
monorimet Mar 1, 2024
bddcb08
Fix iree_target_triple pytest arg.
monorimet Mar 1, 2024
214b526
fix sd/sdxl CI (#500)
jinchen62 Mar 2, 2024
65067a0
(WIP) Move argparser and start mlir pipelining for sdxl.
monorimet Mar 4, 2024
1c2c2bd
test sdxl inference (#503)
jinchen62 Mar 4, 2024
a97b27c
fix unet script args
monorimet Mar 5, 2024
7a52bcc
Set max_model_length in CLIP tokenizers based on user spec.
monorimet Mar 5, 2024
6cd40a3
Small models and script fixes.
monorimet Mar 5, 2024
8e6f85e
Add SDXL pipeline script and unify SDXL args.
monorimet Mar 6, 2024
ab35501
Fix compiled scheduled unet pipeline.
monorimet Mar 6, 2024
efc2136
Fix formatting, ireec_flags parsing, weights naming in pipeline script
monorimet Mar 6, 2024
805f29d
fixup: remove breakpoint
monorimet Mar 6, 2024
63cc7ef
Remove windows hardcoded rocm bc dir flag.
monorimet Mar 6, 2024
5d9e19f
Fix sdxl test args (#520)
jinchen62 Mar 6, 2024
da6809a
Fixup pipeline mlir -> vmfb
monorimet Mar 6, 2024
a3c4751
Explicitly set dtypes based on precision argument
monorimet Mar 6, 2024
e87e6b1
Fixup fp16 pipeline
eagarvey-amd Mar 7, 2024
7d0caee
Fix vae decode export case returning tuple.
monorimet Mar 7, 2024
e514e3a
Fixup breakpoint
monorimet Mar 7, 2024
7f9ca66
Fix VAE export case (again)
monorimet Mar 7, 2024
0df84aa
Fix vae export function returns for vmfb.
monorimet Mar 7, 2024
d71ceb1
Remove source map stripping flag from rocm compile args
monorimet Mar 8, 2024
0ebd3fa
Add .mlir for unrolled loop, add option to have scheduled unet return…
monorimet Mar 8, 2024
424c1d5
Fix --return_path for pipeline.
monorimet Mar 8, 2024
0199fd8
Add --decomp_attn conditional back into unet.py
monorimet Mar 8, 2024
08ffad4
Add unrolled pipeline IRs
eagarvey-amd Mar 8, 2024
b771d05
Update rocm flags for sd.
monorimet Mar 9, 2024
a55200b
Switch const_expr_hoisting to true by default.
monorimet Mar 9, 2024
91687db
fix steps count output of run_initialize
monorimet Mar 9, 2024
e248cca
Add batch count to pipeline, improve benchmarking reports, explicitly…
monorimet Mar 10, 2024
56684e6
Rework timings, start simplifying prompt encoding
monorimet Mar 10, 2024
d29145d
Add a variant of the pipeline with 0 device->host after tokenization
monorimet Mar 11, 2024
d1c1f26
Fix issues with preparation of files after export
monorimet Mar 11, 2024
d514720
Fix prep for old pipeline.
monorimet Mar 11, 2024
ef7746f
Fix seed propagation and batching.
monorimet Mar 11, 2024
d976ab0
Fix formatting.
monorimet Mar 11, 2024
552798a
Fix return ordering of export_prompt_encoder call.
monorimet Mar 11, 2024
620b53c
Correct timesteps for benchmarking PNDM
monorimet Mar 12, 2024
b2d3398
Fixes to pipeline, cooler prompt, fix scheduled unet comparisons
monorimet Mar 13, 2024
b630c80
Fixups to pipeline, import examples, move unrolled loop .mlirs to Azure
monorimet Mar 14, 2024
93aaf08
formatting fixes
monorimet Mar 14, 2024
0c9c605
small fixes
monorimet Mar 14, 2024
0dd0b6b
Let the user know if comparison is OK
monorimet Mar 14, 2024
fb73926
Bake in flags to utils for MI instructions.
monorimet Mar 14, 2024
e3cd97e
Remove vector distribution from golden MI flags
monorimet Mar 14, 2024
26d1e65
Add attention spec flag and check in a default verified version.
monorimet Mar 14, 2024
75a36a4
Add attention spec flag to parser
monorimet Mar 14, 2024
7c40f02
add attn_spec to vae expoirt
monorimet Mar 14, 2024
c68fec1
Prop. attn_spec to compilation correctly.
monorimet Mar 14, 2024
9ac15e7
Setup mlir input and downloads for SDXL models, update flags for gfx9XX
monorimet Mar 14, 2024
eb139ed
Remove empty flags before parsing ireec opts.
monorimet Mar 14, 2024
630b720
Bump MI flags for SDXL branch of IREE.
monorimet Mar 14, 2024
5ae2946
Add all flags
monorimet Mar 14, 2024
06504b1
Comment out weights-only getter
monorimet Mar 14, 2024
1ef84c4
Prop attn_spec arg to unet.py
monorimet Mar 14, 2024
2e53620
Update MI flags for sdxl.
monorimet Mar 14, 2024
5b87995
The golden flag commit
monorimet Mar 14, 2024
9b35a58
Update docs.
monorimet Mar 14, 2024
98af417
Simplify some compile flags and add weights fetching option to exports
monorimet Mar 16, 2024
1a6291f
Add input mlir opt to unet.py and add winograd flag.
monorimet Mar 16, 2024
00387bf
Fix --input_mlir for unet/vae
monorimet Mar 16, 2024
c7ef8f4
Exit after .vmfb compiles if --input_mlir specified.
monorimet Mar 16, 2024
13f493e
Use --device for all runner scripts since it is unambiguous there.
monorimet Mar 16, 2024
fa2c52f
send outputs to host before output/comparison.
monorimet Mar 16, 2024
e04f5a5
Disable native math precision flag on CLIP
monorimet Mar 16, 2024
6f15574
Flags update (remove native math precision on VAE)
monorimet Mar 16, 2024
33ea878
Pipe through mlir_source in mlir input mode for Scheduled unet.
monorimet Mar 16, 2024
65a6f23
Bump spec to 1bcbef6
monorimet Mar 16, 2024
b687c2c
Make it easier to run and validate scheduled unet + pipeline wrapper.
monorimet Mar 16, 2024
5270841
Fix bug generating model artifacts with --external_weights=irpa
monorimet Mar 17, 2024
9f3a5b7
add full pipeline wrapper .mlir and compile alongside scheduled unet
monorimet Mar 17, 2024
496e126
Switch clip main function name and pipe through support for e2e onesh…
monorimet Mar 17, 2024
f02405a
fixup: differentiate pipeline filenames by mode
monorimet Mar 17, 2024
bf2afa7
Small fixes to pipeline modes
monorimet Mar 17, 2024
ca8a059
Small fixes to pipeline vmfb naming
monorimet Mar 17, 2024
10bc439
Move d2h after image completion outside of computation timing.
monorimet Mar 18, 2024
4cd3596
Fix formatting
monorimet Mar 18, 2024
f616846
formatting with right black version
monorimet Mar 18, 2024
84e4a81
Add requests to serving setup.
monorimet Mar 18, 2024
377918d
Update and rename import_examples.md to COMMANDS.md
monorimet Mar 18, 2024
914caa6
Bypass type check on two functionalized graph method calls.
monorimet Mar 18, 2024
c72f38d
Fix formatting
eagarvey-amd Mar 18, 2024
493184b
Refactor pipeline into a class and update sdxl e2e test.
eagarvey-amd Mar 18, 2024
c088f49
Fixup args for SDXL pipeline
monorimet Mar 18, 2024
82084d9
Fix conditional logic for setting sdxl flags.
monorimet Mar 18, 2024
668eaa2
Fix formatting.
eagarvey-amd Mar 18, 2024
06ce9a1
Bump attn spec to
eagarvey-amd Mar 18, 2024
afd8c97
Update flags for MI perf.
monorimet Mar 19, 2024
b2dd042
Fixup flags.
monorimet Mar 19, 2024
e370c8f
Set latest flags and attention spec (07f52fe)
monorimet Mar 20, 2024
938c9ea
add a separate flag for decomposing attn in VAE
eagarvey-amd Mar 20, 2024
55fc076
Flags and spec update to 90bacfae
monorimet Mar 21, 2024
f0d5f5d
Update attn spec.
monorimet Mar 21, 2024
0f7ccad
Fixup weights only exports/f32 cpu route
monorimet Mar 22, 2024
c85b5d6
Fix sdpa on Vulkan for SD (#557)
gpetters94 Mar 27, 2024
d545889
Fix args in sd_test
monorimet Mar 28, 2024
c388f08
Fixup test API calls.
monorimet Mar 29, 2024
96ca856
cleanup triple default behavior
monorimet Mar 29, 2024
d2a9af5
small fixes to sd_test and sd utils.
monorimet Mar 29, 2024
cdff9f5
fixup sd_test
monorimet Mar 29, 2024
2a1bc50
Explicitly send outputs to host for test runners.
monorimet Mar 29, 2024
d4848d7
Fix latent_model_input calculation in scheduled unet w/ EulerDiscrete…
aviator19941 Mar 29, 2024
a559e57
Fix segfaults issue by disabling caching allocator on CPU
monorimet Mar 29, 2024
4c74c96
Fix formatting.
monorimet Mar 29, 2024
b2871f8
Remove redundant d2h for clip outputs
monorimet Mar 29, 2024
f5d5a3f
send correct guidance_scale value to unet runner
monorimet Mar 29, 2024
e20bd59
Fixup test file mgmt.
monorimet Mar 29, 2024
69a1bef
Remove expected system exits from testing.
monorimet Mar 29, 2024
a70e9b5
few more fixes to sdxl tests, args
monorimet Mar 29, 2024
9f73fbb
Tweak test config.
monorimet Apr 4, 2024
f02fbd3
Merge branch 'main' into ean-sd-fp16
monorimet Apr 4, 2024
8cff3d8
Fix precision for cpu test.
monorimet Apr 4, 2024
c8d62fe
Explicitly install nod-ai diffusers fork for sd tests.
monorimet Apr 4, 2024
b92be0e
Install turbine-models requirements in model testing job.
monorimet Apr 4, 2024
7bcb003
Don't specify pipeline directory for model unit tests.
monorimet Apr 4, 2024
f569cea
fix stateless llama testing (#600)
saienduri Apr 8, 2024
bc54f7b
Remove expected failure for vae encoder test.
monorimet Apr 8, 2024
fe27035
Change rocm runtime device to "hip"
monorimet Apr 8, 2024
15ddf3b
Merge branch 'main' into ean-sd-fp16
monorimet Apr 9, 2024
699ba0d
Try hip driver and tweak rocm flags.
monorimet Apr 9, 2024
0011328
cleanup pipeline test artifacts after completion.
monorimet Apr 9, 2024
4d7bfef
restrict wmma flags to gfx94X
monorimet Apr 9, 2024
946a02f
Decompose attention in CI tests.
monorimet Apr 9, 2024
77d4308
Pipe through attn spec option correctly.
monorimet Apr 9, 2024
3336b6b
Use fp16 for mi210 CI.
monorimet Apr 9, 2024
68c3c6c
Fix default attention spec behavior
monorimet Apr 9, 2024
8dc1fba
Update test_models.yml
monorimet Apr 9, 2024
bfbebef
xfail e2e on rocm shortly, pending move to nightly test
monorimet Apr 10, 2024
246d32d
Merge branch 'main' into ean-sd-fp16
monorimet Apr 10, 2024
bb2c7e0
use config A for cpu CI
monorimet Apr 10, 2024
12b91f4
Remove xfails on submodels for rocm.
monorimet Apr 11, 2024
575bcd0
Cleanup comments and redundant code.
monorimet Apr 11, 2024
eaeb646
Skip tests that crash on MI210 for now.
monorimet Apr 11, 2024
6 changes: 5 additions & 1 deletion .github/workflows/test_models.yml
@@ -34,6 +34,7 @@ jobs:
uses: actions/checkout@v2

- name: Sync source deps
# build IREE from source with -DIREE_BUILD_TRACY=ON if getting tracy profile
run: |
python -m venv turbine_venv
source turbine_venv/bin/activate
@@ -44,7 +45,7 @@
pip install -r core/pytorch-cpu-requirements.txt
pip install --pre --upgrade -r core/requirements.txt
pip install --pre -e core[testing]
pip install --pre -e models
pip install --pre --upgrade -e models -r models/requirements.txt

- name: Show current free memory
run: |
@@ -59,3 +60,6 @@
run: |
source turbine_venv/bin/activate
pytest models/turbine_models/tests/sd_test.py
pytest models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --decomp_attn True
pytest models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux
pytest models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16
2 changes: 1 addition & 1 deletion core/iree-requirements.txt
@@ -1,2 +1,2 @@
iree-compiler==20240403.851
iree-runtime==20240403.851
iree-runtime==20240403.851
7 changes: 7 additions & 0 deletions core/shark_turbine/aot/builtins/jittable.py
@@ -214,6 +214,13 @@ def flat_wrapped_f(*args):
if "functorch_functionalize" in self._passes:
transformed_f = functorch_functionalize(transformed_f, *flat_pytorch_args)

for node in transformed_f.graph.nodes:  # type: ignore
    if node.op == "call_function":
        if node.target == torch._ops.ops.aten.lift_fresh_copy.default:
            print(f"replaced lift_fresh_copy")
            node.target = torch._ops.ops.aten.clone.default
transformed_f.recompile()  # type: ignore

Contributor: isn't this being done in fx_importer.py? if we need it here instead should we remove it there?

Contributor Author: Seems to be covered by torch-mlir's fx importer. Removing it here.

Contributor (on the print call): remove printout?

# Ask dynamo to give us an aten graph.
# TODO: Cache this for repeated calls.
logger.debug("Performing dynamo.export(constraints=%r)", constraints)
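The hunk above walks the functionalized FX graph and retargets `aten.lift_fresh_copy` calls to `aten.clone` before recompiling. A torch-free sketch of that rewrite loop, with a minimal stand-in `Node` class and string targets in place of the real `torch._ops` handles (both are assumptions for illustration), might look like:

```python
# Stand-in for a torch.fx node; the real targets are
# torch._ops.ops.aten.lift_fresh_copy.default and aten.clone.default.
class Node:
    def __init__(self, op, target):
        self.op = op
        self.target = target


LIFT_FRESH_COPY = "aten.lift_fresh_copy.default"
CLONE = "aten.clone.default"


def retarget_lift_fresh_copy(nodes):
    """Retarget every call_function node that lifts a fresh copy to a clone."""
    replaced = 0
    for node in nodes:
        if node.op == "call_function" and node.target == LIFT_FRESH_COPY:
            node.target = CLONE
            replaced += 1
    return replaced
```

In the real pass the graph module is recompiled afterwards (`transformed_f.recompile()`); as the review thread notes, torch-mlir's fx importer already performs this substitution, so the hunk was dropped.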
2 changes: 0 additions & 2 deletions core/shark_turbine/dynamo/decompositions.py
@@ -115,8 +115,6 @@ def _get_default_decomposition_ops() -> DecompositionOpsList:
aten.lift_fresh_copy.default,
aten._unsafe_index.Tensor,
aten.unbind.int,
# decompositions added manually in this file
aten._scaled_dot_product_flash_attention.default,
]


2 changes: 1 addition & 1 deletion models/requirements.txt
@@ -3,7 +3,7 @@ sentencepiece
shark_turbine
transformers==4.37.1
accelerate
diffusers==0.24.0
diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release
brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b
# turbine tank downloading/uploading
azure-storage-blob
@@ -4,19 +4,17 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import sys
import argparse
import numpy as np
import re
import os
import re
import sys

from transformers import AutoTokenizer
from iree import runtime as ireert
from turbine_models.utils.benchmark import benchmark_module
import turbine_models.custom_models.stateless_llama as llama

import argparse

import subprocess
from collections import namedtuple

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -71,16 +69,20 @@ def run_benchmark(args):
input.append(temp)
input.append(np.array(args.steps))

vmfbs = []
vmfbs.append(args.llama_vmfb_path)
vmfbs.append(args.benchmark_vmfb_path)

if args.external_weight_file:
results = benchmark_module(
benchmark_mod,
args,
"run",
vmfbs,
input,
parameters=f"model={args.external_weight_file}",
)
else:
results = benchmark_module(benchmark_mod, args, "run", input)
results = benchmark_module(benchmark_mod, "run", vmfbs, input)

for benchmark_result in results:
print(
@@ -146,16 +148,20 @@ def run_forward_benchmark(args):
input.append(temp)
input.append(np.array(args.steps))

vmfbs = []
vmfbs.append(args.llama_vmfb_path)
vmfbs.append(args.benchmark_vmfb_path)

if args.external_weight_file:
results = benchmark_module(
benchmark_mod,
args,
"run",
vmfbs,
input,
parameters=f"model={args.external_weight_file}",
)
else:
results = benchmark_module(benchmark_mod, args, "run", input)
results = benchmark_module(benchmark_mod, "run", vmfbs, input)

for benchmark_result in results:
print(
@@ -178,110 +184,6 @@ def run_forward_benchmark(args):
np.dtype(np.bool_): "i1",
}

BenchmarkResult = namedtuple(
"BenchmarkResult", "benchmark_name time cpu_time iterations user_counters"
)


class BenchmarkToolError(Exception):
"""Benchmark exception that preserves the command line and error output."""

def __init__(self, message):
self.message = message
super().__init__(self.message)


class BenchmarkTimeoutError(Exception):
"""Exception raised if the benchmark is cancelled by the user specified timeout."""

pass


def benchmark_module(
module, bench_args, entry_function=None, inputs=[], timeout=None, **kwargs
):
funcs = [a for a in module.function_names if a != "__init"]
if entry_function is None:
if len(funcs) > 1:
raise ValueError(f"No function specified with multiple options {funcs}")
entry_function = funcs[0]
if entry_function not in funcs:
raise ValueError(
f"Attempted to benchmark unknown function {entry_function} of options {funcs}"
)

args = [ireert.benchmark_exe()]
args.append(f"--function={entry_function}")

for inp in inputs:
if isinstance(inp, str):
args.append(f"--input={inp}")
continue
shape = "x".join([str(d) for d in inp.shape])
abitype = DTYPE_TO_ABI_TYPE[inp.dtype]
values = inp.flatten()
if np.all(values[0] == values):
values = str(values[0])
else:
values = ",".join([str(v) for v in values])

args.append(f"--input={shape}x{abitype}={values}")

for k in kwargs:
v = kwargs[k]
args.append(f"--{k}={v}")

args.append(f"--module={bench_args.llama_vmfb_path}")
args.append(f"--module={bench_args.benchmark_vmfb_path}")

try:
benchmark_process = subprocess.run(
args=args,
# input=flatbuffer,
timeout=timeout,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
except subprocess.TimeoutExpired:
raise BenchmarkTimeoutError(f"Benchmark timed out after {timeout} seconds")
out = benchmark_process.stdout
err = benchmark_process.stderr

err = err.decode()
if "INVALID_ARGUMENT;" in err:
raise ValueError("Invalid inputs specified for benchmarking")

# In the event benchmarking runs but encounteres an internal error,
# return the internal error instead of benchmark results.
if "INTERNAL; CUDA driver error" in str(out):
raise BenchmarkToolError(str(out))

# Grab individual results by line (skip header lines)
bench_lines = out.decode().split("\n")[3:]
benchmark_results = []
for line in bench_lines:
split = line.split()
if len(split) == 0:
continue
benchmark_name = split[0]
time = " ".join(split[1:3])
cpu_time = " ".join(split[3:5])
iterations = split[5]
user_counters = None
if len(split) > 5:
user_counters = split[6]
benchmark_results.append(
BenchmarkResult(
benchmark_name=benchmark_name,
time=time,
cpu_time=cpu_time,
iterations=iterations,
user_counters=user_counters,
)
)

return benchmark_results
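The deleted helper serializes each numpy input into an iree-benchmark-module `--input=` flag: dimensions joined with `x`, the ABI dtype, then either a single splat value (when the tensor is constant) or a comma-separated list. That construction can be sketched on its own; `to_input_flag` is a hypothetical name for illustration, not part of the benchmark API:

```python
def to_input_flag(shape, abi_type, values):
    """Format a flat tensor as an iree-benchmark-module --input flag.

    Constant tensors collapse to a single splat value, mirroring the
    deleted benchmark_module helper above.
    """
    dims = "x".join(str(d) for d in shape)
    if values and all(v == values[0] for v in values):
        body = str(values[0])  # splat: one value stands for the whole tensor
    else:
        body = ",".join(str(v) for v in values)
    return f"--input={dims}x{abi_type}={body}"
```

For example, a 2x2 fp32 tensor of ones becomes `--input=2x2xf32=1.0`, while distinct values are listed out as in `--input=3xi64=1,2,3`.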


if __name__ == "__main__":
args = parser.parse_args()
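For reference, the row parsing that this PR moves into `turbine_models.utils.benchmark` splits each google-benchmark-style output line into name, wall time, CPU time, iterations, and an optional counters column. A standalone sketch follows; note the deleted code guarded on `len(split) > 5` before reading `split[6]`, so the guard is assumed tightened to `> 6` here to keep a counter-less row from indexing out of range:

```python
from collections import namedtuple

BenchmarkResult = namedtuple(
    "BenchmarkResult", "benchmark_name time cpu_time iterations user_counters"
)


def parse_benchmark_output(out_lines):
    """Parse whitespace-separated benchmark rows, skipping blank lines."""
    results = []
    for line in out_lines:
        split = line.split()
        if not split:
            continue
        results.append(
            BenchmarkResult(
                benchmark_name=split[0],
                time=" ".join(split[1:3]),      # e.g. "12.3 ms"
                cpu_time=" ".join(split[3:5]),  # e.g. "11.9 ms"
                iterations=split[5],
                # Assumed fix: the removed code checked len(split) > 5
                # before reading split[6].
                user_counters=split[6] if len(split) > 6 else None,
            )
        )
    return results
```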