From b72ebfb37d39b0ddd32e56a6ac4d9e9cc2c8f991 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 17 Apr 2024 00:27:17 -0500 Subject: [PATCH 001/174] Small fixes to SDXL compilation and SD unet script fix. --- .../custom_models/sd_inference/unet.py | 9 +- .../custom_models/sd_inference/utils.py | 4 +- .../default_mfma_attn_spec.mlir | 655 ++---------------- .../sdxl_inference/sdxl_prompt_encoder.py | 2 +- .../custom_models/sdxl_inference/unet.py | 2 +- 5 files changed, 57 insertions(+), 615 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 18657ae86..21ee83327 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -190,7 +190,8 @@ def main( args.iree_target_triple, args.vulkan_max_allocation, ) - safe_name = utils.create_safe_name(args.hf_model_name, "-unet") - with open(f"{safe_name}.mlir", "w+") as f: - f.write(mod_str) - print("Saved to", safe_name + ".mlir") + if mod_str is not None: + safe_name = utils.create_safe_name(args.hf_model_name, "-unet") + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") \ No newline at end of file diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index a90824dae..a2dda1c7b 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -93,10 +93,12 @@ def compile_to_vmfb( [ "--iree-hal-target-backends=rocm", "--iree-rocm-target-chip=" + target_triple, - "--verify=false", "--iree-opt-const-eval=false", + "--iree-opt-data-tiling=False", ] ) + if "unet" in safe_name: + flags.extend(["--iree-codegen-llvmgpu-use-vector-distribution"]) elif device == "cuda": flags.extend( [ diff --git a/models/turbine_models/custom_models/sdxl_inference/default_mfma_attn_spec.mlir b/models/turbine_models/custom_models/sdxl_inference/default_mfma_attn_spec.mlir index 794c83d99..4bbe76a1b 100644 --- a/models/turbine_models/custom_models/sdxl_inference/default_mfma_attn_spec.mlir +++ b/models/turbine_models/custom_models/sdxl_inference/default_mfma_attn_spec.mlir @@ -5,8 +5,8 @@ // TODO: Figure out how to parameterize the tile sizes without duplicating // the attention function. -#layout_16 = #iree_gpu.mfma_layout -#layout = #iree_gpu.mfma_layout +#layout_16 = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout module attributes { transform.with_named_sequence } { //===----------------------------------------------------------------------===// @@ -27,7 +27,7 @@ module attributes { transform.with_named_sequence } { } // Script for FA2 transform pipeline when head_dim % 64 = 0. 
- transform.named_sequence @__attention_main(%variant_op: !transform.any_op {transform.consumed}) { + transform.named_sequence @__attention_main(%variant_op: !transform.any_op {transform.readonly}) { // Get attention op // ========================================== %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op @@ -150,32 +150,30 @@ module attributes { transform.with_named_sequence } { transform.apply_patterns.scf.for_loop_canonicalization } : !transform.any_op transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () + transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) + %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) // Step 5. Pre-process the contract and transfer ops to put it in the right form. // =========================================================================== - %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_2 { + transform.apply_patterns to %memref_func { transform.apply_patterns.iree.fold_arith_ext_into_contraction } : !transform.any_op // Step 6. Post-bufferization vector distribution // =========================================================================== - %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> () + transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () + transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> () - transform.apply_patterns to %func_7 { + transform.apply_patterns to %memref_func { transform.apply_patterns.memref.fold_memref_alias_ops } : !transform.any_op - transform.iree.apply_licm %func_7 : !transform.any_op - transform.apply_patterns to %func_7 { + transform.iree.apply_licm %memref_func : !transform.any_op + transform.apply_patterns to %memref_func { transform.apply_patterns.canonicalization } : !transform.any_op - transform.apply_cse to %func_7 : !transform.any_op - %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7 + transform.apply_cse to %memref_func : !transform.any_op + %func_8 = transform.structured.hoist_redundant_vector_transfers %memref_func : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %func_8 { transform.apply_patterns.canonicalization @@ -187,17 +185,17 @@ module attributes { transform.with_named_sequence } { transform.apply_registered_pass "iree-amdgpu-prepare-chained-matmul" to %func_8 : (!transform.any_op) -> (!transform.any_op) // Get the vector.contract ops. 
- %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op %contract1, %contract2 = transform.split_handle %contracts : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %layout16x16x16 = transform.param.constant #layout -> !transform.any_param transform.iree.set_contraction_layout_attributes %contract1, %layout16x16x16 { read_layout_indices = array } : !transform.any_op, !transform.any_param transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param - %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.iree.amdgpu_distribute_vectors %distribute_func : !transform.any_op - %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %distribute_func_2 { transform.apply_patterns.canonicalization @@ -206,34 +204,32 @@ module attributes { transform.with_named_sequence } { // Distribute shared memory copies // ========================================== - %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.gpu_distribute_shared_memory_copy %func_10 : (!transform.any_op) -> () - transform.apply_patterns to %func_10 { + transform.iree.gpu_distribute_shared_memory_copy %distribute_func_2 : (!transform.any_op) -> () + transform.apply_patterns to %distribute_func_2 { transform.apply_patterns.memref.fold_memref_alias_ops transform.apply_patterns.canonicalization transform.apply_patterns.linalg.tiling_canonicalization } : !transform.any_op - transform.apply_cse to %func_10 : !transform.any_op + transform.apply_cse to %distribute_func_2 : !transform.any_op - %forop = transform.structured.match ops{["scf.for"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %forop = transform.structured.match ops{["scf.for"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op %prefetched_forop = transform.iree.prefetch_shared_memory_copies %forop : (!transform.any_op) -> (!transform.any_op) - transform.apply_patterns to %func_10 { + transform.apply_patterns to %distribute_func_2 { transform.apply_patterns.memref.fold_memref_alias_ops transform.apply_patterns.canonicalization transform.apply_patterns.linalg.tiling_canonicalization } : !transform.any_op - transform.apply_cse to %func_10 : !transform.any_op + transform.apply_cse to %distribute_func_2 : !transform.any_op - %func_11 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.amdgpu.optimize_shared_memory_reads_and_writes %func_11 : (!transform.any_op) -> () + transform.iree.reduce_shared_memory_bank_conflicts %distribute_func_2 : (!transform.any_op) -> () transform.yield } // Script for FA2 transform pipeline for head_dim = 512. // For head_dim = 512, since the matmul is so big, and just try to do a single wave big load + big mfma. 
- transform.named_sequence @__attention_main_len_512(%variant_op: !transform.any_op {transform.consumed}) { + transform.named_sequence @__attention_main_len_512(%variant_op: !transform.any_op {transform.readonly}) { // Get attention op // ========================================== %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op @@ -356,32 +352,30 @@ module attributes { transform.with_named_sequence } { transform.apply_patterns.scf.for_loop_canonicalization } : !transform.any_op transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () + transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) + %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) // Step 5. Pre-process the contract and transfer ops to put it in the right form. // =========================================================================== - %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_2 { + transform.apply_patterns to %memref_func { transform.apply_patterns.iree.fold_arith_ext_into_contraction } : !transform.any_op // Step 6. Post-bufferization vector distribution // =========================================================================== - %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> () + transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () + transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> () - transform.apply_patterns to %func_7 { + transform.apply_patterns to %memref_func { transform.apply_patterns.memref.fold_memref_alias_ops } : !transform.any_op - transform.iree.apply_licm %func_7 : !transform.any_op - transform.apply_patterns to %func_7 { + transform.iree.apply_licm %memref_func : !transform.any_op + transform.apply_patterns to %memref_func { transform.apply_patterns.canonicalization } : !transform.any_op - transform.apply_cse to %func_7 : !transform.any_op - %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7 + transform.apply_cse to %memref_func : !transform.any_op + %func_8 = transform.structured.hoist_redundant_vector_transfers %memref_func : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %func_8 { transform.apply_patterns.canonicalization @@ -392,20 +386,20 @@ module attributes { transform.with_named_sequence } { // Apply chained matmul optimization. transform.apply_registered_pass "iree-amdgpu-prepare-chained-matmul" to %func_8 : (!transform.any_op) -> (!transform.any_op) - // transform.print %variant_op_3 : !transform.any_op + // transform.print %memref_func : !transform.any_op // Get the vector.contract ops. 
- %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op %contract1, %contract2 = transform.split_handle %contracts : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %layout16x16x16 = transform.param.constant #layout_16 -> !transform.any_param transform.iree.set_contraction_layout_attributes %contract1, %layout16x16x16 { read_layout_indices = array } : !transform.any_op, !transform.any_param transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param - %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.iree.amdgpu_distribute_vectors %distribute_func : !transform.any_op - %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %distribute_func_2 { transform.apply_patterns.canonicalization @@ -414,7 +408,7 @@ module attributes { transform.with_named_sequence } { // Distribute shared memory copies // ========================================== - %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %func_10 = transform.structured.match ops{["func.func"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op transform.iree.gpu_distribute_shared_memory_copy %func_10 : (!transform.any_op) -> () transform.apply_patterns to %func_10 { transform.apply_patterns.memref.fold_memref_alias_ops @@ -423,7 +417,7 @@ module attributes { transform.with_named_sequence } { } : !transform.any_op transform.apply_cse to %func_10 : !transform.any_op - %forop = transform.structured.match ops{["scf.for"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %forop = transform.structured.match ops{["scf.for"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op %prefetched_forop = transform.iree.prefetch_shared_memory_copies %forop : (!transform.any_op) -> (!transform.any_op) transform.apply_patterns to %func_10 { @@ -433,18 +427,17 @@ module attributes { transform.with_named_sequence } { } : !transform.any_op transform.apply_cse to %func_10 : !transform.any_op - %func_11 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.amdgpu.optimize_shared_memory_reads_and_writes %func_11 : (!transform.any_op) -> () + %func_11 = transform.structured.match ops{["func.func"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op + transform.iree.reduce_shared_memory_bank_conflicts %func_11 : (!transform.any_op) -> () transform.yield } // Send it down a custom transform dialect pipeline. 
transform.named_sequence @custom_attention_len_512(%attention: !transform.any_op {transform.readonly}) { - %variant_op = transform.get_parent_op %attention {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op + %func = transform.get_parent_op %attention {op_name = "func.func"} : (!transform.any_op) -> !transform.any_op %attn = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %attn : !transform.any_op, !transform.any_param + transform.annotate %func "translation_info" = %attn : !transform.any_op, !transform.any_param transform.yield } @@ -457,10 +450,9 @@ module attributes { transform.with_named_sequence } { // Send it down a custom transform dialect pipeline. transform.named_sequence @custom_attention(%attention: !transform.any_op {transform.readonly}) { - %variant_op = transform.get_parent_op %attention {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op + %func = transform.get_parent_op %attention {op_name = "func.func"} : (!transform.any_op) -> !transform.any_op %attn = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %attn : !transform.any_op, !transform.any_param + transform.annotate %func "translation_info" = %attn : !transform.any_op, !transform.any_param transform.yield } @@ -472,535 +464,6 @@ module attributes { transform.with_named_sequence } { transform.yield %attention : !transform.any_op } -//===----------------------------------------------------------------------===// -// Matmul tuning -//===----------------------------------------------------------------------===// - - transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) { - transform.match.operation_name %root ["linalg.generic"] : !transform.any_op - // transform.print %root {name = "Generic"} : !transform.any_op - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root { - ^bb0(%lhs: tensor, %rhs: tensor, %out: tensor): - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%lhs, %rhs : tensor, tensor) outs(%out : tensor) { - ^bb0(%in: f16, %in_0: f16, %acc: f32): - %8 = arith.extf %in : f16 to f32 - %9 = arith.extf %in_0 : f16 to f32 - %10 = arith.mulf %8, %9 : f32 - %11 = arith.addf %acc, %10 : f32 - linalg.yield %11 : f32 - } -> tensor - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - transform.yield %root : !transform.any_op - } - - transform.named_sequence @match_mmt_f16_f16_f16(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) { - transform.match.operation_name %root ["linalg.generic"] : !transform.any_op - // transform.print %root {name = "Generic"} : !transform.any_op - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root { - ^bb0(%lhs: tensor, %rhs: tensor, %out: tensor): - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - 
iterator_types = ["parallel", "parallel", "reduction"]} - ins(%lhs, %rhs : tensor, tensor) outs(%out : tensor) { - ^bb0(%in: f16, %in_0: f16, %acc: f16): - %10 = arith.mulf %in, %in_0 : f16 - %11 = arith.addf %acc, %10 : f16 - linalg.yield %11 : f16 - } -> tensor - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - transform.yield %root : !transform.any_op - } - - transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly}, %config: !transform.any_param {transform.readonly}) { - transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param - // transform.print %op {name = "Applied"} : !transform.any_op - transform.yield - } - - transform.named_sequence @match_mmt_2048x10240x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<2048x1280xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<10240x1280xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 4, - subgroup_n_tile_count = 4, - subgroup_k_tile_count = 2>, no_reorder_workgroups}>, - workgroup_size = [128, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_mmt_2048x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<2048x1280xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 1, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 5, - subgroup_k_tile_count = 4>}>, - workgroup_size = [64, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_mmt_2048x1280x5120(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<2048x5120xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs 
= tensor<1280x5120xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 5, - subgroup_k_tile_count = 4>}>, - workgroup_size = [128, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_mmt_128x1280x2048(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f16 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<128x2048xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<1280x2048xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 1, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 16>}>, - workgroup_size = [128, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_mmt_8192x640x2560(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<8192x2560xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<640x2560xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 5, - subgroup_k_tile_count = 4>}>, - workgroup_size = [128, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_mmt_8192x5120x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<8192x640xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<5120x640xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - 
subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 4, - subgroup_n_tile_count = 4, - subgroup_k_tile_count = 2>}>, - workgroup_size = [128, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_mmt_128x640x2048(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<128x2048xf16> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<640x2048xf16> : !transform.any_value - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 1, - subgroup_m_tile_count = 1, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 32>}>, - workgroup_size = [64, 2, 1], subgroup_size = 64 - > -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - } - -//===----------------------------------------------------------------------===// -// Convolution tuning -//===----------------------------------------------------------------------===// - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x1280(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x1280xf16>, %rhs: tensor<3x3x1280x1280xf16>, %out: tensor<2x32x32x1280xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x1280xf16>, tensor<3x3x1280x1280xf16>) - outs(%out : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 5, - subgroup_m_tile_count = 1, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 8>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x1920(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x1920xf16>, %rhs: tensor<3x3x1920x1280xf16>, %out: tensor<2x32x32x1280xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x1920xf16>, tensor<3x3x1920x1280xf16>) - outs(%out : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = 
#iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 5, - subgroup_m_tile_count = 1, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 8>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x2560(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x2560xf16>, %rhs: tensor<3x3x2560x1280xf16>, %out: tensor<2x32x32x1280xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x2560xf16>, tensor<3x3x2560x1280xf16>) - outs(%out : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 5, - subgroup_m_tile_count = 1, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 8>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x64x64x640x3x3x640(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x640xf16>, %rhs: tensor<3x3x640x640xf16>, %out: tensor<2x64x64x640xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x640xf16>, tensor<3x3x640x640xf16>) - outs(%out : tensor<2x64x64x640xf32>) -> tensor<2x64x64x640xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 5, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 4>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x64x64x640x3x3x1280(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x1280xf16>, %rhs: tensor<3x3x1280x640xf16>, %out: tensor<2x64x64x640xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x1280xf16>, tensor<3x3x1280x640xf16>) - outs(%out : tensor<2x64x64x640xf32>) -> tensor<2x64x64x640xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 10, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 
10>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x64x64x640x3x3x1920(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x1920xf16>, %rhs: tensor<3x3x1920x640xf16>, %out: tensor<2x64x64x640xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x1920xf16>, tensor<3x3x1920x640xf16>) - outs(%out : tensor<2x64x64x640xf32>) -> tensor<2x64x64x640xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 10, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 10>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x64x64x1280x3x3x1280(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - transform.match.operation_name %conv ["linalg.conv_2d_nhwc_hwcf"] : !transform.any_op - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x66x66x1280xf16>, %rhs: tensor<3x3x1280x1280xf16>, %out: tensor<2x64x64x1280xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x66x66x1280xf16>, tensor<3x3x1280x1280xf16>) - outs(%out : tensor<2x64x64x1280xf32>) -> tensor<2x64x64x1280xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 10, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 10>}>, - workgroup_size = [640, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x128x128x320x3x3x320(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x320xf16>, %rhs: tensor<3x3x320x320xf16>, %out: tensor<2x128x128x320xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x320xf16>, tensor<3x3x320x320xf16>) - outs(%out : tensor<2x128x128x320xf32>) -> tensor<2x128x128x320xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 5, - subgroup_m_tile_count = 4, - subgroup_n_tile_count = 2, - subgroup_k_tile_count = 5>}>, - workgroup_size = [320, 1, 1], subgroup_size = 64> 
-> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x128x128x320x3x3x640(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x640xf16>, %rhs: tensor<3x3x640x320xf16>, %out: tensor<2x128x128x320xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x640xf16>, tensor<3x3x640x320xf16>) - outs(%out : tensor<2x128x128x320xf32>) -> tensor<2x128x128x320xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 5, - subgroup_m_tile_count = 4, - subgroup_n_tile_count = 2, - subgroup_k_tile_count = 5>}>, - workgroup_size = [320, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x128x128x320x3x3x960(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x960xf16>, %rhs: tensor<3x3x960x320xf16>, %out: tensor<2x128x128x320xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x960xf16>, tensor<3x3x960x320xf16>) - outs(%out : tensor<2x128x128x320xf32>) -> tensor<2x128x128x320xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 5, - subgroup_m_tile_count = 4, - subgroup_n_tile_count = 2, - subgroup_k_tile_count = 5>}>, - workgroup_size = [320, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_conv_2d_nhwc_hwcf_2x128x128x640x3x3x640(%conv: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv { - ^bb0(%lhs: tensor<2x?x?x640xf16>, %rhs: tensor<3x3x640x640xf16>, %out: tensor<2x128x128x640xf32>): - %13 = linalg.conv_2d_nhwc_hwcf { dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64> } - ins(%lhs, %rhs : tensor<2x?x?x640xf16>, tensor<3x3x640x640xf16>) - outs(%out : tensor<2x128x128x640xf32>) -> tensor<2x128x128x640xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 1, subgroup_n_count = 4, - subgroup_m_tile_count = 4, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 4>}>, - workgroup_size = [256, 1, 1], subgroup_size = 64> -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - } - 
-//===----------------------------------------------------------------------===// -// Contraction tuning -//===----------------------------------------------------------------------===// - - transform.named_sequence @match_contract_2x1024x1280x20x64(%contract: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %contract { - ^bb0(%lhs: tensor<2x20x1024x64xf16>, %rhs: tensor<1280x20x64xf16>, %out: tensor<2x1024x1280xf32>): - %20 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d4)>, - affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"] - } ins(%lhs, %rhs : tensor<2x20x1024x64xf16>, tensor<1280x20x64xf16>) - outs(%out : tensor<2x1024x1280xf32>) { - ^bb0(%in: f16, %in_0: f16, %acc: f32): - %22 = arith.extf %in : f16 to f32 - %23 = arith.extf %in_0 : f16 to f32 - %24 = arith.mulf %22, %23 : f32 - %25 = arith.addf %acc, %24 : f32 - linalg.yield %25 : f32 - } -> tensor<2x1024x1280xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 5, - subgroup_k_tile_count = 4>}>, - workgroup_size = [128, 2, 1], subgroup_size = 64> -> !transform.any_param - // transform.print %contract {name = "Contract"} : !transform.any_op - transform.yield %contract, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_contract_2x2x20x64x64x2048(%contract: !transform.any_op {transform.readonly}) - -> (!transform.any_op, !transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %contract { - ^bb0(%lhs: tensor<2x64x2048xf16>, %rhs: tensor<2x20x64x2048xf16>, %out: tensor<2x2x20x64x64xf32>): - %10 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, - affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>, - affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction"] - } ins(%lhs, %rhs : tensor<2x64x2048xf16>, tensor<2x20x64x2048xf16>) - outs(%out : tensor<2x2x20x64x64xf32>) { - ^bb0(%in: f16, %in_0: f16, %acc: f32): - %12 = arith.extf %in : f16 to f32 - %13 = arith.extf %in_0 : f16 to f32 - %14 = arith.mulf %12, %13 : f32 - %15 = arith.addf %acc, %14 : f32 - linalg.yield %15 : f32 - } -> tensor<2x2x20x64x64xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 1, - subgroup_m_tile_count = 1, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 8>}>, - workgroup_size = [64, 2, 1], subgroup_size = 64> -> !transform.any_param - // transform.print %contract {name = "Contract"} : !transform.any_op - transform.yield %contract, %config : !transform.any_op, !transform.any_param - } - - transform.named_sequence @match_contract_3x2x20x64x64x1280(%contract: !transform.any_op {transform.readonly}) - -> (!transform.any_op, 
!transform.any_param) { - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %contract { - ^bb0(%lhs: tensor<2x1024x1280xf16>, %rhs: tensor<3x20x64x1280xf16>, %out: tensor<3x2x20x1024x64xf32>): - %14 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, - affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>, - affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction"] - } ins(%lhs, %rhs : tensor<2x1024x1280xf16>, tensor<3x20x64x1280xf16>) - outs(%out : tensor<3x2x20x1024x64xf32>) { - ^bb0(%in: f16, %in_0: f16, %acc: f32): - %16 = arith.extf %in : f16 to f32 - %17 = arith.extf %in_0 : f16 to f32 - %18 = arith.mulf %16, %17 : f32 - %19 = arith.addf %acc, %18 : f32 - linalg.yield %19 : f32 - } -> tensor<3x2x20x1024x64xf32> - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = #iree_codegen.translation_info, - subgroup_m_count = 2, subgroup_n_count = 2, - subgroup_m_tile_count = 2, - subgroup_n_tile_count = 1, - subgroup_k_tile_count = 8>}>, - workgroup_size = [128, 2, 1], subgroup_size = 64> -> !transform.any_param - // transform.print %contract {name = "Contract"} : !transform.any_op - transform.yield %contract, %config : !transform.any_op, !transform.any_param - } - //===----------------------------------------------------------------------===// // Entry point //===----------------------------------------------------------------------===// @@ -1009,31 +472,7 @@ module attributes { transform.with_named_sequence } { transform.foreach_match in %variant_op // Attention. @match_attention_len_512 -> @custom_attention_len_512, - @match_attention -> @custom_attention, - // Matmul tuning. - @match_mmt_2048x10240x1280 -> @apply_op_config, - @match_mmt_2048x1280x1280 -> @apply_op_config, - @match_mmt_2048x1280x5120 -> @apply_op_config, - @match_mmt_128x1280x2048 -> @apply_op_config, - @match_mmt_128x640x2048 -> @apply_op_config, - @match_mmt_8192x640x2560 -> @apply_op_config, - @match_mmt_8192x5120x640 -> @apply_op_config, - // Convolution tuning. - @match_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x1280 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x1920 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x2560 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x64x64x640x3x3x640 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x64x64x640x3x3x1280 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x64x64x640x3x3x1920 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x64x64x1280x3x3x1280 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x128x128x320x3x3x320 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x128x128x320x3x3x640 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x128x128x320x3x3x960 -> @apply_op_config, - @match_conv_2d_nhwc_hwcf_2x128x128x640x3x3x640 -> @apply_op_config, - // Contract tuning. 
- @match_contract_2x1024x1280x20x64 -> @apply_op_config, - @match_contract_2x2x20x64x64x2048 -> @apply_op_config, - @match_contract_3x2x20x64x64x1280 -> @apply_op_config + @match_attention -> @custom_attention : (!transform.any_op) -> (!transform.any_op) transform.yield } diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 1c6b6331c..bf3d606e5 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -121,7 +121,7 @@ def export_prompt_encoder( else: do_classifier_free_guidance = True - if (attn_spec in ["default"]) and ("gfx94" in target_triple): + if (attn_spec in ["default"]) and ("gfx9" in target_triple): attn_spec = os.path.join( os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" ) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index e9839ba06..7ae695449 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -108,7 +108,7 @@ def export_unet_model( if ( (attn_spec in ["default"]) and decomp_attn == False - and ("gfx9" in target_triple) + and ("gfx" in target_triple) ): attn_spec = os.path.join( os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" From ccbe1ddc6b467f295b73ac53e5378050de52146a Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 17 Apr 2024 11:43:32 -0500 Subject: [PATCH 002/174] Formatting and compile flag fixes. --- .../custom_models/sd_inference/unet.py | 2 +- .../custom_models/sd_inference/utils.py | 26 ++++--------------- .../custom_models/sdxl_inference/unet.py | 6 +---- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 21ee83327..d5ee63ae1 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -194,4 +194,4 @@ def main( safe_name = utils.create_safe_name(args.hf_model_name, "-unet") with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) - print("Saved to", safe_name + ".mlir") \ No newline at end of file + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index a2dda1c7b..1bc7fd0d0 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -14,34 +14,18 @@ "all": [ "--iree-global-opt-propagate-transposes=true", "--iree-opt-outer-dim-concat=true", - "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode", "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", "--verify=false", - "--iree-rocm-waves-per-eu=2", "--iree-opt-data-tiling=false", - "--iree-codegen-log-swizzle-tile=4", - "--iree-llvmgpu-promote-filter=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", ], "unet": [ - "--iree-codegen-llvmgpu-use-conv-vector-distribute-pipeline", - "--iree-codegen-llvmgpu-reduce-skinny-matmuls", "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution", - 
"--iree-codegen-winograd-use-forall", - ], - "clip": [ - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-codegen-llvmgpu-reduce-skinny-matmuls", - "--iree-global-opt-only-sink-transposes=true", - ], - "vae": [ - "--iree-codegen-llvmgpu-use-conv-vector-distribute-pipeline", - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-global-opt-only-sink-transposes=true", - "--iree-codegen-winograd-use-forall", ], + "clip": [], + "vae": [], } @@ -97,8 +81,8 @@ def compile_to_vmfb( "--iree-opt-data-tiling=False", ] ) - if "unet" in safe_name: - flags.extend(["--iree-codegen-llvmgpu-use-vector-distribution"]) + if target_triple == "gfx942": + flags.extend(["--iree-rocm-waves-per-eu=2"]) elif device == "cuda": flags.extend( [ @@ -124,7 +108,7 @@ def compile_to_vmfb( if flag not in [None, "", " "]: flags.append(flag) - if target_triple in ["gfx940", "gfx941", "gfx942"]: + if target_triple in ["gfx940", "gfx941", "gfx942", "gfx1100"]: if "unet" in safe_name: flags.extend(gfx94X_flags["unet"]) elif any(x in safe_name for x in ["clip", "prompt_encoder"]): diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 7ae695449..d3154a3af 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -105,11 +105,7 @@ def export_unet_model( else: do_classifier_free_guidance = True - if ( - (attn_spec in ["default"]) - and decomp_attn == False - and ("gfx" in target_triple) - ): + if (attn_spec in ["default"]) and decomp_attn == False and ("gfx" in target_triple): attn_spec = os.path.join( os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" ) From 76cc4db5230e23611121d9aff225769461e5ce00 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 17 Apr 2024 11:48:44 -0500 Subject: [PATCH 003/174] Enable e2e sdxl test on mi250. --- models/turbine_models/tests/sdxl_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index a45fd7ca4..e8b359248 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -502,7 +502,7 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Have issues with submodels on these backends") mlirs = { "vae_decode": None, From 5feaf4df4cd9d7a001493c305b0b53dd88f595c8 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:51:24 -0500 Subject: [PATCH 004/174] Don't decompose attention in sdxl tests by default. 
--- models/turbine_models/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/tests/conftest.py b/models/turbine_models/tests/conftest.py index 7a1f55b1a..1c1952605 100644 --- a/models/turbine_models/tests/conftest.py +++ b/models/turbine_models/tests/conftest.py @@ -36,7 +36,7 @@ def pytest_addoption(parser): # General Options parser.addoption("--compile_to", action="store", default=None) parser.addoption("--external_weights", action="store", default="safetensors") - parser.addoption("--decomp_attn", action="store", default=True) + parser.addoption("--decomp_attn", action="store", default=False) parser.addoption("--attn_spec", action="store", default="") # Compiler Options parser.addoption("--device", action="store", default="cpu") From 15c46f0e081b32530f292aca11ddaa4406a27ebc Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 17 Apr 2024 17:20:27 -0500 Subject: [PATCH 005/174] Remove xfails for rocm on submodels. --- models/turbine_models/tests/sdxl_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index e8b359248..ef66d1572 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -93,7 +93,7 @@ def setUp(self): ) def test01_ExportClipModels(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( "Compilation error on vulkan; Runtime error on rocm; To be tested on cuda." ) @@ -215,7 +215,7 @@ def test01_ExportClipModels(self): np.testing.assert_allclose(torch_output_2, turbine_2[0], rtol, atol) def test02_ExportUnetModel(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( "Unknown error on vulkan; Runtime error on rocm; To be tested on cuda." ) @@ -325,7 +325,7 @@ def test02_ExportUnetModel(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test03_ExportVaeModelDecode(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( "Compilation error on vulkan; Runtime error on rocm; To be tested on cuda." ) From 452ff7cf980cf30c735b71aed0e041535000e764 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 17 Apr 2024 18:43:50 -0500 Subject: [PATCH 006/174] Change default attention spec path string in unet script. --- models/turbine_models/custom_models/sdxl_inference/unet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index d3154a3af..75a535814 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -96,7 +96,7 @@ def export_unet_model( ireec_flags=None, decomp_attn=False, exit_on_vmfb=False, - attn_spec=None, + attn_spec="default", input_mlir=None, weights_only=False, ): From 9e3a153128d65880c1eb5cbbfad46a5bb6a8d577 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 17 Apr 2024 19:37:52 -0500 Subject: [PATCH 007/174] Move attn spec and simplify parsing. 
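The default MFMA attention spec moves from sdxl_inference to sd_inference, and resolution of the "default" value collapses into utils.compile_to_vmfb: export scripts simply pass attn_spec="default", and the utility substitutes the bundled spec path for gfx targets before appending the transform-dialect-library flag. A minimal sketch of that resolution logic as it lands in utils.py (the helper name here is illustrative only, not part of the change):

    import os

    def _resolve_attn_spec(attn_spec: str, target_triple: str) -> str:
        # "default" maps to the MFMA spec shipped alongside utils.py on AMD
        # gfx targets; any other non-empty value is used as a spec path as-is.
        if attn_spec == "default" and "gfx" in target_triple:
            return os.path.join(
                os.path.realpath(os.path.dirname(__file__)),
                "default_mfma_attn_spec.mlir",
            )
        return attn_spec
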
--- .../default_mfma_attn_spec.mlir | 0 .../custom_models/sd_inference/utils.py | 5 +++++ .../sdxl_inference/sdxl_prompt_encoder.py | 8 -------- .../sdxl_inference/sdxl_scheduled_unet.py | 10 ---------- .../custom_models/sdxl_inference/unet.py | 7 ------- .../custom_models/sdxl_inference/vae.py | 11 ----------- 6 files changed, 5 insertions(+), 36 deletions(-) rename models/turbine_models/custom_models/{sdxl_inference => sd_inference}/default_mfma_attn_spec.mlir (100%) diff --git a/models/turbine_models/custom_models/sdxl_inference/default_mfma_attn_spec.mlir b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir similarity index 100% rename from models/turbine_models/custom_models/sdxl_inference/default_mfma_attn_spec.mlir rename to models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 1bc7fd0d0..afeb567e9 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -118,6 +118,11 @@ def compile_to_vmfb( flags.extend(gfx94X_flags["all"]) if attn_spec not in [None, "", " "]: + if (attn_spec in ["default"]) and ("gfx" in target_triple): + attn_spec = os.path.join( + os.path.realpath(os.path.dirname(__file__)), + "default_mfma_attn_spec.mlir", + ) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) print("Compiling to", device, "with flags:", flags) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index bf3d606e5..1f56031ed 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -120,14 +120,6 @@ def export_prompt_encoder( do_classifier_free_guidance = False else: do_classifier_free_guidance = True - - if (attn_spec in ["default"]) and ("gfx9" in target_triple): - attn_spec = os.path.join( - os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" - ) - else: - attn_spec = None - if pipeline_dir not in [None, ""]: safe_name = os.path.join(pipeline_dir, "prompt_encoder") else: diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index f74c707e7..fa0db44ba 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -139,16 +139,6 @@ def export_scheduled_unet_model( do_classifier_free_guidance = False else: do_classifier_free_guidance = True - if ( - (attn_spec in ["default"]) - and decomp_attn == False - and ("gfx9" in iree_target_triple) - ): - attn_spec = os.path.join( - os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" - ) - elif decomp_attn: - attn_spec = None if pipeline_dir: safe_name = os.path.join( diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 75a535814..270e3f44f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -105,13 +105,6 @@ def export_unet_model( else: do_classifier_free_guidance = True - if (attn_spec in ["default"]) and decomp_attn == False 
and ("gfx" in target_triple): - attn_spec = os.path.join( - os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" - ) - elif decomp_attn: - attn_spec = None - safe_name = utils.create_safe_name( hf_model_name, f"_{max_length}_{height}x{width}_{precision}_unet_{device}" ) diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index 7563eed96..dd3adb525 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -84,17 +84,6 @@ def export_vae_model( input_mlir=None, weights_only=False, ): - if ( - (attn_spec in ["default"]) - and decomp_attn == False - and ("gfx9" in target_triple) - ): - attn_spec = os.path.join( - os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir" - ) - elif decomp_attn: - attn_spec = None - if pipeline_dir: safe_name = os.path.join(pipeline_dir, "vae_" + variant) else: From e97421c936e3662dfff8431ac81e6ef4773184dc Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 17 Apr 2024 21:52:39 -0500 Subject: [PATCH 008/174] Fix attn spec handling a bit more. --- .github/workflows/test_models.yml | 2 +- models/turbine_models/custom_models/sd_inference/utils.py | 2 +- models/turbine_models/custom_models/sdxl_inference/unet.py | 2 +- models/turbine_models/tests/sdxl_test.py | 2 ++ 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index b7facb903..8f400eaf8 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -71,5 +71,5 @@ jobs: pytest -v models/turbine_models/tests/sd_test.py pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux - pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 + pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index afeb567e9..062d03381 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -118,7 +118,7 @@ def compile_to_vmfb( flags.extend(gfx94X_flags["all"]) if attn_spec not in [None, "", " "]: - if (attn_spec in ["default"]) and ("gfx" in target_triple): + if attn_spec == "default": attn_spec = os.path.join( os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir", diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 270e3f44f..a02a72586 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -96,7 +96,7 @@ def export_unet_model( ireec_flags=None, decomp_attn=False, exit_on_vmfb=False, - attn_spec="default", + attn_spec=None, input_mlir=None, weights_only=False, ): diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index ef66d1572..da10c96f5 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py 
@@ -240,6 +240,7 @@ def test02_ExportUnetModel(self): target_triple=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], decomp_attn=arguments["decomp_attn"], + attn_spec=arguments["attn_spec"], ) arguments["external_weight_path"] = ( self.safe_model_name @@ -349,6 +350,7 @@ def test03_ExportVaeModelDecode(self): ireec_flags=arguments["ireec_flags"], variant="decode", decomp_attn=arguments["decomp_attn"], + attn_spec=arguments["attn_spec"], exit_on_vmfb=True, ) arguments["external_weight_path"] = ( From 21f71cdc3e59464865b14d4380ee5fb7b51ed67c Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 18 Apr 2024 10:39:33 -0500 Subject: [PATCH 009/174] compile VAE last and disable decomp_attn entirely by default. --- .../custom_models/sdxl_inference/sdxl_cmd_opts.py | 2 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 8 ++++---- .../custom_models/sdxl_inference/sdxl_scheduled_unet.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py index f2faa0323..8921847ad 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py @@ -170,7 +170,7 @@ def is_valid_file(arg): p.add_argument( "--vae_decomp_attn", type=bool, - default=True, + default=False, help="Decompose attention for VAE decode only at fx graph level", ) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index f17a17f60..7c7bf5f48 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -613,30 +613,30 @@ def numpy_to_pil_image(images): from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args mlirs = { - "vae_decode": None, "prompt_encoder": None, "scheduled_unet": None, + "vae_decode": None, "pipeline": None, "full_pipeline": None, } vmfbs = { - "vae_decode": None, "prompt_encoder": None, "scheduled_unet": None, + "vae_decode": None, "pipeline": None, "full_pipeline": None, } weights = { - "vae_decode": None, "prompt_encoder": None, "scheduled_unet": None, + "vae_decode": None, "pipeline": None, "full_pipeline": None, } ireec_flags = { + "clip": args.ireec_flags + args.clip_flags, "unet": args.ireec_flags + args.unet_flags, "vae": args.ireec_flags + args.vae_flags, - "clip": args.ireec_flags + args.clip_flags, "pipeline": args.ireec_flags, } if not args.pipeline_dir: diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index fa0db44ba..026e1b62e 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -173,7 +173,7 @@ def export_scheduled_unet_model( torch.ops.aten._scaled_dot_product_flash_attention.default, ] ) - + print(decomp_list) dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": From 4a18301e5cdc6270e6815e531c4cbdf930b802f3 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:08:47 -0500 Subject: [PATCH 010/174] Update SDXL readme. 
--- .../custom_models/sdxl_inference/README.md | 66 +++++++++++++++---- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/README.md b/models/turbine_models/custom_models/sdxl_inference/README.md index 19783c146..cce3aeafd 100644 --- a/models/turbine_models/custom_models/sdxl_inference/README.md +++ b/models/turbine_models/custom_models/sdxl_inference/README.md @@ -1,29 +1,67 @@ -# Stable Diffusion Commands +# Stable Diffusion XL with SHARK-Turbine -## Run and benchmark the entire SDXL pipeline on MI300 - - note: the command below is specifically for use on the ppac-pla-s22-35 instance. you may need to tweak paths accordingly. - - follow "setup repository" in the next section - - optional: set HF_HOME to save dl time/ disk usage +## Setup SHARK-Turbine for importing and running the SDXL pipeline or submodels. + +Linux: +```shell +python -m venv turbine_venv +source turbine_venv/bin/activate +python -m pip install --upgrade pip +pip install -r core/pytorch-cpu-requirements.txt +pip install --pre --upgrade -r core/requirements.txt +pip install --pre -e core +pip install --pre --upgrade -e models -r models/requirements.txt ``` -export HF_HOME=/mnt/dcgpuval/huggingface/ #ppac -export HF_HOME=/data/huggingface-cache #banff + +Windows: +```shell +python -m venv turbine_venv +turbine_venv/Scripts/activate +python -m pip install --upgrade pip +pip install -r core/pytorch-cpu-requirements.txt +pip install --pre --upgrade -r core/requirements.txt +pip install --pre -e core +pip install --pre --upgrade -e models -r models/requirements.txt ``` - - make sure you have ROCM working with IREE, check `iree-run-module --dump_devices` - - make a file called "mfma_spec.mlir" and drop in the contents of the TD script https://github.com/nod-ai/2024-q1-sdxl-sprint/tree/main/specs. -### Newest pipeline command, weights (as of [SHARK-Turbine@ean-sd-fp16:6251fbef9233c406093dab056a08cd42cfc54a0b](https://github.com/nod-ai/SHARK-Turbine/commit/6251fbef9233c406093dab056a08cd42cfc54a0b)): +## Run tests +ROCM: +``` +pytest models/turbine_models/tests/sdxl_test.py --device=rocm --rt_device= --iree_target_triple=gfx --external_weights=safetensors +``` +CPU: +``` +pytest models/turbine_models/tests/sdxl_test.py --device=cpu --rt_device=local-task --iree_target_triple=x86-64_linux_gnu --external_weights=safetensors --precision=fp32 +``` + +## Run image generation pipeline -gfx940: +ROCM: ``` -python SHARK-Turbine/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py --precision=fp16 --external_weights=safetensors --device=rocm --rt_device=rocm --iree_target_triple=gfx942 --scheduler_id=PNDM --num_inference_steps=30 --pipeline_dir=./sdxl_fp16_1024x1024_gfx940/ --external_weights_dir=./weights_fp16/ --attn_spec=default +python models\turbine_models\custom_models\sdxl_inference\sdxl_compiled_pipeline.py --iree_target_triple=gfx1100 --device=rocm --rt_device=hip --external_weights=safetensors ``` +For mfma-capable hardware, use `--attn_spec=default` to lower attention ops to MFMA instructions. 
-gfx942: +CPU: ``` -python SHARK-Turbine/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py --precision=fp16 --external_weights=safetensors --device=rocm --rt_device=rocm --iree_target_triple=gfx940 --scheduler_id=PNDM --num_inference_steps=30 --pipeline_dir=./sdxl_fp16_1024x1024_gfx940/ --external_weights_dir=./weights_fp16/ --attn_spec=default +pytest models/turbine_models/tests/sdxl_test.py --device=cpu --rt_device=local-task --iree_target_triple=x86-64_linux_gnu --external_weights=safetensors --precision=fp32 ``` +## Shared CLI options + - `--iree_target_triple`: use gfx1100 for 7900xt, gfx90a for MI210/MI250, gfx940 for MI300A, gfx942 for MI300X. For CPU, use x86_64-linux-gnu if you aren't sure. On Vulkan, this is something like `rdna3-7900-windows`. + - `--rt_device`: if using pip install, `hip` will work correctly, but `rocm` will not. Source builds of IREE can support rocm with the `-DIREE_HAL_DRIVER_ROCM=ON -DIREE_EXTERNAL_HAL_DRIVERS="rocm"`, but that option is soon to be deprecated in favor of the HIP driver. + - `--compiled_pipeline`: run one-shot SDXL in a MLIR wrapper, removing model glue from python execution layer + - `--pipeline_dir`: directory in which to save or look for .vmfb files. + - `--external_weights_dir`: directory in which to save or look for weights. + - `--ireec_flags`: extra ireec flags to use for _all_ submodels. + - `--unet_flags / --vae_flags / --clip_flags`: extra ireec flags for individual submodels. + - `--precision`: fp16 or fp32. Default is fp16 and you should only use fp32 for cpu. + - `--num_inference_steps`: (default 30) number of unet iterations to run. + - `--batch_count`: Not compatible with `--compiled_pipeline`. Uses the same clip output to generate a set of images in a batch, with different image latents. + - `--prompt / --negative_prompt`: prompts for stable diffusion inference + + Note: the following "prompt_encoder_f16.irpa" contains weights for both clip1 and clip2. The pipeline script will look for these filenames in the specified "external_weights_dir" under "prompt_encoder.irpa", "vae_decode.irpa", "scheduled_unet.irpa". It's not ideal in current state, but will be smoothed out now that general pipeline structure and file management needs are stable. From 7918824c6ae0531a79efeca2b9de71ac99b1a7f1 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 18 Apr 2024 11:16:40 -0500 Subject: [PATCH 011/174] Remove print statement. 
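
The print removed here was a leftover debug dump of the export-time decomposition list.
For background on what that list carries when attention decomposition is requested, a
minimal illustrative sketch follows: the empty base list and the build_decomp_list wrapper
are placeholders, and only the flash-attention op name is taken from the exporter code
quoted elsewhere in this series.

```python
import copy

import torch

# Illustrative stand-in for the shared default decomposition list imported by the
# export scripts; the real list is not empty.
DEFAULT_DECOMPOSITIONS: list = []


def build_decomp_list(decomp_attn: bool) -> list:
    # Copy the shared list so one export cannot leak extra decompositions into the
    # next; a later patch in this series switches the exporters to deepcopy for
    # exactly that reason.
    decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS)
    if decomp_attn:
        # Only this op name appears in the exporter code shown in this series;
        # other scaled-dot-product-attention variants may be appended there too.
        decomp_list.extend([torch.ops.aten._scaled_dot_product_flash_attention.default])
    return decomp_list
```

When decomp_attn is left False, the fused attention op survives torch import and is
lowered later, for example by the MFMA attention spec selected with --attn_spec.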
--- .../custom_models/sdxl_inference/sdxl_scheduled_unet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 026e1b62e..4c30d5e65 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -173,7 +173,6 @@ def export_scheduled_unet_model( torch.ops.aten._scaled_dot_product_flash_attention.default, ] ) - print(decomp_list) dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": From d8839bbdeebcaa584b388daf34fac02610deaeaf Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 18 Apr 2024 11:19:49 -0500 Subject: [PATCH 012/174] Add flags to compile step for gfx90a --- models/turbine_models/custom_models/sd_inference/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 062d03381..cd68dbd7b 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -108,7 +108,7 @@ def compile_to_vmfb( if flag not in [None, "", " "]: flags.append(flag) - if target_triple in ["gfx940", "gfx941", "gfx942", "gfx1100"]: + if target_triple in ["gfx940", "gfx941", "gfx942", "gfx1100", "gfx90a"]: if "unet" in safe_name: flags.extend(gfx94X_flags["unet"]) elif any(x in safe_name for x in ["clip", "prompt_encoder"]): From 73a63287a744bbf7b73a3597c3f74d53127d1c53 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Thu, 18 Apr 2024 23:59:24 -0500 Subject: [PATCH 013/174] Add support table for gfx targets to sdxl readme. --- .../custom_models/sdxl_inference/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/models/turbine_models/custom_models/sdxl_inference/README.md b/models/turbine_models/custom_models/sdxl_inference/README.md index cce3aeafd..8f82e3e38 100644 --- a/models/turbine_models/custom_models/sdxl_inference/README.md +++ b/models/turbine_models/custom_models/sdxl_inference/README.md @@ -1,5 +1,18 @@ # Stable Diffusion XL with SHARK-Turbine +## Support + +Following is a table that shows current status of turbine SDXL inference support for a few AMDGPU targets. This is not an exhaustive list of supported targets. + +| Target Chip | Attention Decomposed? | CLIP | UNet | VAE Decode | Txt2Img | +|-------------|-----------------------|---------------|--------------------------------|--------------------------------|----------------| +| gfx1100 | Yes | 💚 | 💛 (numerics with vector distribution)| 💚 | 💚 | +| | No | | 💔 (Attn lowering) | 💔 (Attn lowering) | 💔 | +| gfx90a | Yes | 💚 | 💚 | 💚 | 💚 | +| | No | | 💔 (Shared Memory) | 💚 | 💔 | +| gfx942 | Yes | 💚 | 💚 | 💚 | 💚 | +| | No | | 💚 | 💚 | 💚 | + ## Setup SHARK-Turbine for importing and running the SDXL pipeline or submodels. 
Linux: From ead97cfe72e4661787b7c1a796d066689af0bb3b Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 19 Apr 2024 12:14:54 -0500 Subject: [PATCH 014/174] Switch MI250 run to cfg B --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 8f400eaf8..d6d054f7b 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -71,5 +71,5 @@ jobs: pytest -v models/turbine_models/tests/sd_test.py pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux - pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default + pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default --decomp_attn True From 1915ad92e56a7f6bb1d14da93d7f5e7a38db3780 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Fri, 19 Apr 2024 18:01:06 -0500 Subject: [PATCH 015/174] Update xfails --- models/turbine_models/tests/sdxl_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index da10c96f5..24af91096 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -215,7 +215,7 @@ def test01_ExportClipModels(self): np.testing.assert_allclose(torch_output_2, turbine_2[0], rtol, atol) def test02_ExportUnetModel(self): - if arguments["device"] in ["vulkan", "cuda"]: + if arguments["device"] in ["vulkan", "cuda", "rocm"]: self.skipTest( "Unknown error on vulkan; Runtime error on rocm; To be tested on cuda." ) @@ -328,7 +328,7 @@ def test02_ExportUnetModel(self): def test03_ExportVaeModelDecode(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( - "Compilation error on vulkan; Runtime error on rocm; To be tested on cuda." + "Compilation error on vulkan; To be tested on cuda." ) vae.export_vae_model( vae_model=self.vae_model, @@ -504,7 +504,7 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): - if arguments["device"] in ["vulkan", "cuda"]: + if arguments["device"] in ["vulkan", "cuda", "rocm"]: self.skipTest("Have issues with submodels on these backends") mlirs = { "vae_decode": None, From b582ef074dc01b82db2c2b7a36e5caddb56a433b Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 19 Apr 2024 18:02:51 -0500 Subject: [PATCH 016/174] formatting --- models/turbine_models/tests/sdxl_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 24af91096..7dc976944 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -327,9 +327,7 @@ def test02_ExportUnetModel(self): def test03_ExportVaeModelDecode(self): if arguments["device"] in ["vulkan", "cuda"]: - self.skipTest( - "Compilation error on vulkan; To be tested on cuda." 
- ) + self.skipTest("Compilation error on vulkan; To be tested on cuda.") vae.export_vae_model( vae_model=self.vae_model, # This is a public model, so no auth required From 110a64810a093653a59b8a583560d91afd8fd3e6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 19 Apr 2024 22:36:11 -0500 Subject: [PATCH 017/174] tolerance adjust for vae test and flag tweaks --- .../custom_models/sd_inference/utils.py | 20 +++++++++++-------- models/turbine_models/tests/sdxl_test.py | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index cd68dbd7b..b71dbec6d 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -10,7 +10,7 @@ ) # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. -gfx94X_flags = { +amdgpu_flags = { "all": [ "--iree-global-opt-propagate-transposes=true", "--iree-opt-outer-dim-concat=true", @@ -18,11 +18,11 @@ "--iree-llvmgpu-enable-prefetch=true", "--verify=false", "--iree-opt-data-tiling=false", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", ], "unet": [ "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", ], "clip": [], "vae": [], @@ -110,20 +110,24 @@ def compile_to_vmfb( if target_triple in ["gfx940", "gfx941", "gfx942", "gfx1100", "gfx90a"]: if "unet" in safe_name: - flags.extend(gfx94X_flags["unet"]) + flags.extend(amdgpu_flags["unet"]) elif any(x in safe_name for x in ["clip", "prompt_encoder"]): - flags.extend(gfx94X_flags["clip"]) + flags.extend(amdgpu_flags["clip"]) elif "vae" in safe_name: - flags.extend(gfx94X_flags["vae"]) - flags.extend(gfx94X_flags["all"]) + flags.extend(amdgpu_flags["vae"]) + flags.extend(amdgpu_flags["all"]) + # Currently, we need a transform dialect script to be applied to the compilation through IREE in certain cases. + # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. + # This is a temporary solution, and should be removed or largely disabled once the functionality of + # the TD spec is implemented in C++. 
if attn_spec not in [None, "", " "]: - if attn_spec == "default": + if attn_spec in ["default", "mfma"]: attn_spec = os.path.join( os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir", ) - flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) + flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) print("Compiling to", device, "with flags:", flags) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 7dc976944..21986d229 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -409,7 +409,7 @@ def test03_ExportVaeModelDecode(self): tracy_profile=arguments["tracy_profile"], ) rtol = 4e-2 - atol = 4e-2 + atol = 4e-1 np.testing.assert_allclose(torch_output, turbine, rtol, atol) From 3461f9156c980fc69c1e903e680e0670fa163681 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 19 Apr 2024 23:59:22 -0500 Subject: [PATCH 018/174] Remove --decomp_attn from mi250 CI job. --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index d6d054f7b..8f400eaf8 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -71,5 +71,5 @@ jobs: pytest -v models/turbine_models/tests/sd_test.py pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux - pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default --decomp_attn True + pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default From ed6eca5391a112b655dfa5260700db4871d3996d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 20 Apr 2024 00:32:16 -0500 Subject: [PATCH 019/174] Reduce number of sdxl inference steps for CPU CI --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 8f400eaf8..1d54feabd 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -69,7 +69,7 @@ jobs: source turbine_venv/bin/activate pytest -v models/turbine_models/tests/sd_test.py - pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu + pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps=2 pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default From 2a182aacbd46a0ef99222eadc601978a01aee193 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 20 Apr 2024 15:29:01 -0500 Subject: [PATCH 020/174] turn off const eval on cpu --- .github/workflows/test_models.yml | 2 +- models/turbine_models/custom_models/sd_inference/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 1d54feabd..0c81e2409 100644 
--- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -69,7 +69,7 @@ jobs: source turbine_venv/bin/activate pytest -v models/turbine_models/tests/sd_test.py - pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps=2 + pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 5 pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index b71dbec6d..b7f76b8e2 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -57,6 +57,7 @@ def compile_to_vmfb( "--iree-llvmcpu-target-cpu-features=host", "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false", "--iree-llvmcpu-distribution-size=32", + "--iree-opt-const-eval=false", ] ) device = "llvm-cpu" From b8efd3c6fd9f4107210ad7ed0fc2326f55284916 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 20 Apr 2024 17:44:37 -0500 Subject: [PATCH 021/174] Remove xfails from rocm sdxl unet+t2i tests --- models/turbine_models/tests/sdxl_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 21986d229..1f1b310df 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -215,7 +215,7 @@ def test01_ExportClipModels(self): np.testing.assert_allclose(torch_output_2, turbine_2[0], rtol, atol) def test02_ExportUnetModel(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( "Unknown error on vulkan; Runtime error on rocm; To be tested on cuda." 
) @@ -321,7 +321,7 @@ def test02_ExportUnetModel(self): tracy_profile=arguments["tracy_profile"], ) rtol = 4e-2 - atol = 4e-2 + atol = 4e-1 np.testing.assert_allclose(torch_output, turbine, rtol, atol) @@ -502,7 +502,7 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Have issues with submodels on these backends") mlirs = { "vae_decode": None, From a70921474250b0be8b5fcc499a612a6c7b8060c6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 20 Apr 2024 22:58:57 -0500 Subject: [PATCH 022/174] xfails --- models/turbine_models/tests/sdxl_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 1f1b310df..4f05e5671 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -215,7 +215,7 @@ def test01_ExportClipModels(self): np.testing.assert_allclose(torch_output_2, turbine_2[0], rtol, atol) def test02_ExportUnetModel(self): - if arguments["device"] in ["vulkan", "cuda"]: + if arguments["device"] in ["vulkan", "cuda", "rocm"]: self.skipTest( "Unknown error on vulkan; Runtime error on rocm; To be tested on cuda." ) @@ -502,7 +502,7 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): - if arguments["device"] in ["vulkan", "cuda"]: + if arguments["device"] in ["vulkan", "cuda", "rocm"]: self.skipTest("Have issues with submodels on these backends") mlirs = { "vae_decode": None, From 65a1cb68cfe4e6f23c99ce859f5883cc4cede99f Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:21:08 -0500 Subject: [PATCH 023/174] Update README.md --- models/turbine_models/custom_models/sdxl_inference/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/README.md b/models/turbine_models/custom_models/sdxl_inference/README.md index 8f82e3e38..4dc9107a4 100644 --- a/models/turbine_models/custom_models/sdxl_inference/README.md +++ b/models/turbine_models/custom_models/sdxl_inference/README.md @@ -9,7 +9,7 @@ Following is a table that shows current status of turbine SDXL inference support | gfx1100 | Yes | 💚 | 💛 (numerics with vector distribution)| 💚 | 💚 | | | No | | 💔 (Attn lowering) | 💔 (Attn lowering) | 💔 | | gfx90a | Yes | 💚 | 💚 | 💚 | 💚 | -| | No | | 💔 (Shared Memory) | 💚 | 💔 | +| | No | | 💛 (Numerics with mfma) | 💚 | 💛 | | gfx942 | Yes | 💚 | 💚 | 💚 | 💚 | | | No | | 💚 | 💚 | 💚 | From 3d94c81bcfc520ad1be941b74fc9a37720858fee Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:17:45 -0500 Subject: [PATCH 024/174] Update sdxl_test.py --- models/turbine_models/tests/sdxl_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 4f05e5671..1f2cc7c01 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -502,7 +502,7 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if 
arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Have issues with submodels on these backends") mlirs = { "vae_decode": None, From a827c9a3dfce2f0b627b7e664b070e4f05ea526c Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:05:53 -0500 Subject: [PATCH 025/174] Remove vector distribution, pad to intrinsics for now --- models/turbine_models/custom_models/sd_inference/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index b7f76b8e2..58091d4e0 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -21,8 +21,7 @@ ], "unet": [ "--iree-codegen-gpu-native-math-precision=true", - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline)", ], "clip": [], "vae": [], From cd223e4fa0bd65f022adb82bd7f451271b2abab4 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 23 Apr 2024 13:10:41 -0500 Subject: [PATCH 026/174] Update flags, deepcopy decomposition defaults --- models/turbine_models/custom_models/sd_inference/utils.py | 8 +++----- .../custom_models/sdxl_inference/sdxl_scheduled_unet.py | 3 ++- .../turbine_models/custom_models/sdxl_inference/unet.py | 3 ++- models/turbine_models/custom_models/sdxl_inference/vae.py | 3 ++- models/turbine_models/tests/sdxl_test.py | 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 58091d4e0..efc896f86 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -12,16 +12,15 @@ # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. 
amdgpu_flags = { "all": [ + ], + "unet": [ "--iree-global-opt-propagate-transposes=true", "--iree-opt-outer-dim-concat=true", "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", - "--verify=false", "--iree-opt-data-tiling=false", - ], - "unet": [ "--iree-codegen-gpu-native-math-precision=true", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline)", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", ], "clip": [], "vae": [], @@ -78,7 +77,6 @@ def compile_to_vmfb( "--iree-hal-target-backends=rocm", "--iree-rocm-target-chip=" + target_triple, "--iree-opt-const-eval=false", - "--iree-opt-data-tiling=False", ] ) if target_triple == "gfx942": diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 4c30d5e65..eeac968f7 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -6,6 +6,7 @@ # from @aviator19941's gist : https://gist.github.com/aviator19941/4e7967bd1787c83ee389a22637c6eea7 +import copy import os import sys @@ -165,7 +166,7 @@ def export_scheduled_unet_model( mapper = {} - decomp_list = DEFAULT_DECOMPOSITIONS + decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) if decomp_attn == True: decomp_list.extend( [ diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index a02a72586..e59a0d79a 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -4,6 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import copy import os import sys @@ -123,7 +124,7 @@ def export_unet_model( return vmfb_path mapper = {} - decomp_list = DEFAULT_DECOMPOSITIONS + decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) if decomp_attn == True: decomp_list.extend( [ diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index dd3adb525..b5bb5225f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -4,6 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import copy import os import sys @@ -104,7 +105,7 @@ def export_vae_model( return vmfb_path mapper = {} - decomp_list = DEFAULT_DECOMPOSITIONS + decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) if decomp_attn == True: decomp_list.extend( [ diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 1f2cc7c01..b23e99319 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -215,9 +215,9 @@ def test01_ExportClipModels(self): np.testing.assert_allclose(torch_output_2, turbine_2[0], rtol, atol) def test02_ExportUnetModel(self): - if arguments["device"] in ["vulkan", "cuda", "rocm"]: + if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( - "Unknown error on vulkan; Runtime error on rocm; To be tested on cuda." + "Unknown error on vulkan; To be tested on cuda." 
) unet.export_unet_model( unet_model=self.unet_model, From 46925c67598fc6ed5a20ed1cd02baade19473d77 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 23 Apr 2024 13:29:02 -0500 Subject: [PATCH 027/174] Formatting --- models/turbine_models/custom_models/sd_inference/utils.py | 3 +-- models/turbine_models/tests/sdxl_test.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index efc896f86..1bada4c49 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -11,8 +11,7 @@ # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. amdgpu_flags = { - "all": [ - ], + "all": [], "unet": [ "--iree-global-opt-propagate-transposes=true", "--iree-opt-outer-dim-concat=true", diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index b23e99319..e9d099e27 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -216,9 +216,7 @@ def test01_ExportClipModels(self): def test02_ExportUnetModel(self): if arguments["device"] in ["vulkan", "cuda"]: - self.skipTest( - "Unknown error on vulkan; To be tested on cuda." - ) + self.skipTest("Unknown error on vulkan; To be tested on cuda.") unet.export_unet_model( unet_model=self.unet_model, # This is a public model, so no auth required From b831643f331f966731f5c327c9d06b62abb3408f Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 23 Apr 2024 15:50:08 -0500 Subject: [PATCH 028/174] xfail t2i test on rocm for now. --- models/turbine_models/tests/sdxl_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index e9d099e27..aab83657c 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -500,8 +500,10 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): - if arguments["device"] in ["vulkan", "cuda"]: - self.skipTest("Have issues with submodels on these backends") + if arguments["device"] in ["vulkan", "cuda", "rocm"]: + self.skipTest( + "Have issues with submodels on vulkan, cuda; ROCM hangs on mi250 despite submodels working." + ) mlirs = { "vae_decode": None, "prompt_encoder": None, From 4120122e3fe7f1edf5aa7050b609c70bdddceae7 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 01:05:36 -0500 Subject: [PATCH 029/174] add return_imgs option to sdxl pipeline image generate fn. 
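
With return_imgs=True the pipeline hands the decoded PIL images back to the caller instead
of writing numbered PNGs itself. A usage sketch follows; the pipe object is assumed to be
an already-built and loaded SharkSDXLPipeline, and the prompt argument names are
assumptions (only batch_count, guidance_scale, seed and return_imgs are spelled out by
this change):

```python
def generate_and_save(pipe, prompt: str, negative_prompt: str = "") -> None:
    # `pipe` is assumed to be a constructed, loaded SharkSDXLPipeline-like object.
    images = pipe.generate_images(
        prompt,
        negative_prompt,
        batch_count=1,
        guidance_scale=7.5,
        seed=-1,
        return_imgs=True,  # return PIL images instead of saving them in the pipeline
    )
    for idx, img in enumerate(images):
        # The caller now decides how and where to persist the outputs.
        img.save(f"sdxl_output_{idx}.png")
```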
--- .../sdxl_inference/sdxl_compiled_pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 7c7bf5f48..2fe1a38d3 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -36,6 +36,7 @@ "vulkan", "cuda", "rocm", + "hip", ] empty_pipe_dict = { @@ -430,6 +431,7 @@ def generate_images( batch_count: int = 1, guidance_scale: float = 7.5, seed: float = -1, + return_imgs: bool = False, ): # TODO: implement case where this is false e.g. in SDXL Turbo # do_classifier_free_guidance = True @@ -585,12 +587,18 @@ def generate_images( "sec", ) timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S") + images = [] for idx, image in enumerate(numpy_images): image = torch.from_numpy(image).cpu().permute(0, 2, 3, 1).float().numpy() image = numpy_to_pil_image(image) + images.append(image[0]) + if return_imgs: + return images + for idx, image in enumerate(images): img_path = "sdxl_output_" + timestamp + "_" + str(idx) + ".png" - image[0].save(img_path) + image.save(img_path) print(img_path, "saved") + return def numpy_to_pil_image(images): @@ -689,5 +697,6 @@ def numpy_to_pil_image(images): args.batch_count, args.guidance_scale, args.seed, + False, ) print("Image generation complete.") From 5ba3c35e50c6ac58bfaf879af6aab805a307235b Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 01:20:55 -0500 Subject: [PATCH 030/174] Fix default ireec flags. --- .../sdxl_inference/sdxl_compiled_pipeline.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 2fe1a38d3..69bd70ca1 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -47,6 +47,13 @@ "full_pipeline": None, } +EMPTY_FLAGS = { + "clip": None, + "unet": None, + "vae": None, + "pipeline": None, +} + class SharkSDXLPipeline: def __init__( @@ -61,7 +68,7 @@ def __init__( num_inference_steps: int, device: str, iree_target_triple: str, - ireec_flags: dict, + ireec_flags: dict = EMPTY_FLAGS, attn_spec: str = None, decomp_attn: bool = False, pipeline_dir: str = "./shark_vmfbs", @@ -79,7 +86,7 @@ def __init__( self.num_inference_steps = num_inference_steps self.device = device self.iree_target_triple = iree_target_triple - self.ireec_flags = ireec_flags + self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS self.attn_spec = attn_spec self.decomp_attn = decomp_attn self.pipeline_dir = pipeline_dir From 29e2506bf8491dfe498f5b5be067e5eb0aa941e3 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 01:31:29 -0500 Subject: [PATCH 031/174] Tweak scheduler names in utils --- models/turbine_models/custom_models/sd_inference/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 1bada4c49..8fd8fcc63 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -200,11 +200,13 @@ def get_schedulers(model_id): 
model_id, subfolder="scheduler", ) - schedulers["Euler"] = EulerDiscreteScheduler.from_pretrained( + schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained( model_id, subfolder="scheduler", ) - schedulers["EulerA"] = EulerAncestralDiscreteScheduler.from_pretrained( + schedulers[ + "EulerAncestralDiscrete" + ] = EulerAncestralDiscreteScheduler.from_pretrained( model_id, subfolder="scheduler", ) From 0d6486c2863a9f6eb3ad8c3bd76024b49b6714aa Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 03:06:20 -0500 Subject: [PATCH 032/174] Inline pipeline .mlir in python scripts for pip package inclusion. --- .../custom_models/sd_inference/utils.py | 2 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 120 +++++++++++++++--- .../sdxl_pipeline_bench_f16.mlir | 23 ---- .../sdxl_pipeline_bench_f32.mlir | 23 ---- .../sdxl_sched_unet_bench_f16.mlir | 19 --- .../sdxl_sched_unet_bench_f32.mlir | 19 --- .../sdxl_inference/sdxl_scheduled_unet.py | 41 +++--- 7 files changed, 131 insertions(+), 116 deletions(-) delete mode 100644 models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f16.mlir delete mode 100644 models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f32.mlir delete mode 100644 models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f16.mlir delete mode 100644 models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f32.mlir diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 8fd8fcc63..5690e5f4a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -22,7 +22,7 @@ "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", ], "clip": [], - "vae": [], + "vae": ["--verify=false"], } diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 69bd70ca1..d0be2c004 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -54,6 +54,102 @@ "pipeline": None, } +sdxl_pipeline_bench_f16 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, 
\22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, 
tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> + return %image : tensor<1x3x1024x1024xf16> + } +} +""" + +sdxl_pipeline_bench_f32 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: 
\22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @tokens_to_image(%sample: tensor<1x4x128x128xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf32> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf32>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> + scf.yield %inner : tensor<1x4x128x128xf32> + } + %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> + return %image : tensor<1x3x1024x1024xf32> + } +} +""" + +sdxl_sched_unet_bench_f16 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, 
\22context\22: null, \22children_spec\22: []}]"} + + func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + return %res : tensor<1x4x128x128xf16> + } +} +""" + +sdxl_sched_unet_bench_f32 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @produce_image_latents(%sample: tensor<1x4x128x128xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: tensor<1xf32>) -> tensor<1x4x128x128xf32> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + 
%steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x128x128xf32>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> + scf.yield %inner : tensor<1x4x128x128xf32> + } + return %res : tensor<1x4x128x128xf32> + } +} +""" + class SharkSDXLPipeline: def __init__( @@ -131,7 +227,7 @@ def check_prepared( print("There was an error generating the necessary files.") exit() else: - print("All necessary files found. Generating images.") + print("All necessary files found. Loading pipeline.") return vmfbs, weights def is_prepared(self, vmfbs, weights): @@ -341,40 +437,34 @@ def export_submodel( return prompt_encoder_vmfb, prompt_encoder_external_weight_path case "pipeline": pipeline_file = ( - "sdxl_sched_unet_bench_" + "f32" + sdxl_sched_unet_bench_f32 if self.precision == "fp32" - else "sdxl_sched_unet_bench_" + "f16" + else sdxl_sched_unet_bench_f16 ) pipeline_vmfb = utils.compile_to_vmfb( - os.path.join( - os.path.realpath(os.path.dirname(__file__)), - pipeline_file + ".mlir", - ), + pipeline_file, self.device, self.iree_target_triple, self.ireec_flags["pipeline"], os.path.join(self.pipeline_dir, "pipeline"), return_path=True, - mlir_source="file", + mlir_source="str", ) return pipeline_vmfb, None case "full_pipeline": pipeline_file = ( - "sdxl_pipeline_bench_" + "f32" + sdxl_pipeline_bench_f32 if self.precision == "fp32" - else "sdxl_pipeline_bench_" + "f16" + else sdxl_pipeline_bench_f16 ) pipeline_vmfb = utils.compile_to_vmfb( - os.path.join( - os.path.realpath(os.path.dirname(__file__)), - pipeline_file + ".mlir", - ), + pipeline_file, self.device, self.iree_target_triple, self.ireec_flags["pipeline"], os.path.join(self.pipeline_dir, "full_pipeline"), return_path=True, - mlir_source="file", + mlir_source="str", ) return pipeline_vmfb, None diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f16.mlir b/models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f16.mlir deleted file mode 100644 index 523d09fa6..000000000 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f16.mlir +++ /dev/null @@ -1,23 +0,0 @@ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: 
tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call 
@compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> - } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> - return %image : tensor<1x3x1024x1024xf16> - } -} \ No newline at end of file diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f32.mlir b/models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f32.mlir deleted file mode 100644 index 669df73b2..000000000 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_pipeline_bench_f32.mlir +++ /dev/null @@ -1,23 +0,0 @@ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, 
\22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @tokens_to_image(%sample: tensor<1x4x128x128xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf32> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf32>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> - scf.yield %inner : tensor<1x4x128x128xf32> - } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> - return %image : tensor<1x3x1024x1024xf32> - } -} \ No newline at end of file diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f16.mlir b/models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f16.mlir deleted file mode 100644 index b12fc82b9..000000000 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f16.mlir +++ /dev/null @@ -1,19 +0,0 @@ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: 
tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> - } - return %res : tensor<1x4x128x128xf16> - } -} \ No newline at end of file diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f32.mlir b/models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f32.mlir deleted file mode 100644 index fbc69f854..000000000 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_sched_unet_bench_f32.mlir +++ /dev/null @@ -1,19 +0,0 @@ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes 
{torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: tensor<1xf32>) -> tensor<1x4x128x128xf32> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x128x128xf32>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> - scf.yield %inner : tensor<1x4x128x128xf32> - } - return %res : tensor<1x4x128x128xf32> - } -} \ No newline at end of file diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index eeac968f7..210a0844c 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -254,31 +254,40 @@ def run_forward( def export_pipeline_module(args): + from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import ( + sdxl_pipeline_bench_f16, + sdxl_pipeline_bench_f32, + sdxl_sched_unet_bench_f16, + sdxl_sched_unet_bench_f32, + ) + pipeline_file = ( - "sdxl_sched_unet_bench_" + "f32" + sdxl_sched_unet_bench_f32 if args.precision == "fp32" - else "sdxl_sched_unet_bench_" + "f16" + else sdxl_sched_unet_bench_f16 + ) + pipeline_vmfb = utils.compile_to_vmfb( + pipeline_file, + args.device, + args.iree_target_triple, + None, + os.path.join(args.pipeline_dir, "pipeline"), + return_path=True, + mlir_source="str", ) - if "turbo" in args.hf_model_name: - pipe_prefix = "sdxl_turbo_pipeline_bench_" - else: - pipe_prefix = "sdxl_pipeline_bench_" full_pipeline_file = ( - pipe_prefix + "f32" if args.precision == "fp32" else pipe_prefix + "f16" + sdxl_pipeline_bench_f32 if args.precision == "fp32" else sdxl_pipeline_bench_f16 ) - full_pipeline_vmfb_path = utils.compile_to_vmfb( - os.path.join( - os.path.realpath(os.path.dirname(__file__)), full_pipeline_file + ".mlir" - ), + full_pipeline_vmfb = utils.compile_to_vmfb( + pipeline_file, args.device, args.iree_target_triple, - args.ireec_flags, - 
"sdxl_full_pipeline_" + args.precision + "_" + args.iree_target_triple, + None, + os.path.join(args.pipeline_dir, "pipeline"), return_path=True, - const_expr_hoisting=False, - mlir_source="file", + mlir_source="str", ) - return full_pipeline_vmfb_path + return full_pipeline_vmfb if __name__ == "__main__": From 13b068d06aa755f419060ce57ec8b86538233fce Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 16:42:45 -0500 Subject: [PATCH 033/174] Add attention decomposition option to stateless_llama.py --- .../custom_models/stateless_llama.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index baa4e2348..766ce24c2 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -2,6 +2,7 @@ import sys import re import json +import copy from turbine_models.turbine_tank import turbine_tank os.environ["TORCH_LOGS"] = "dynamic" @@ -9,6 +10,7 @@ import torch from torch.utils import _pytree as pytree from shark_turbine.aot import * +from shark_turbine.aot import decompositions from iree.compiler.ir import Context from turbine_models.custom_models.llm_optimizations.streaming_llm.modify_llama import ( enable_llama_pos_shift_attention, @@ -62,6 +64,11 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) +parser.add_argument( + "--decomp_attn", + action="store_true", + help="Decompose attention ops at fx graph level.", +) def generate_schema(num_layers): @@ -123,6 +130,7 @@ def export_transformer_model( upload_ir=False, mod=None, tokenizer=None, + decomp_attn=False, ): if tokenizer == None: tokenizer = AutoTokenizer.from_pretrained( @@ -175,6 +183,18 @@ def export_transformer_model( tensor_mapper = remap_gguf.TensorNameMap(remap_gguf.MODEL_ARCH.LLAMA, HEADS) mapper = tensor_mapper.mapping + initial_table = decompositions.current_aot_decompositions() + if decomp_attn == True: + with decompositions.extend_aot_decompositions(from_current=True) as init_t: + with decompositions.extend_aot_decompositions( + add_ops=[ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + ] + ): + current_table = decompositions.current_aot_decompositions() + assert len(current_table) == len(initial_table) + 1 + class StateUpdateModule(CompiledModule): if external_weights: params = export_parameters( @@ -498,6 +518,8 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, + upload_ir=False, + decomp_attn=args.decomp_attn, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) From f1e150c81436760ff7c5f5f1c040e7b91494000e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 17:08:59 -0500 Subject: [PATCH 034/174] Remove old rocm flags --- models/turbine_models/custom_models/stateless_llama.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 766ce24c2..f9dfcc2dc 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -467,8 +467,6 @@ def evict_kvcache_space(self): flags.extend( [ "--iree-rocm-target-chip=" + target_triple, - "--iree-rocm-link-bc=true", - "--iree-vm-bytecode-module-strip-source-map=true", "--iree-opt-strip-assertions=true", 
"--iree-vm-target-truncate-unsupported-floats", ] From 0577d554ea45337c58420160c6b89e2091c6ea95 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 21:51:59 -0500 Subject: [PATCH 035/174] Add a few options for argmax to stateless_llama --- .../custom_models/llama_argmax_td_spec.mlir | 169 ++++++++++++++++++ .../custom_models/stateless_llama.py | 15 +- 2 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 models/turbine_models/custom_models/llama_argmax_td_spec.mlir diff --git a/models/turbine_models/custom_models/llama_argmax_td_spec.mlir b/models/turbine_models/custom_models/llama_argmax_td_spec.mlir new file mode 100644 index 000000000..0ef957cb3 --- /dev/null +++ b/models/turbine_models/custom_models/llama_argmax_td_spec.mlir @@ -0,0 +1,169 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// The configuration used for executable compilation. +// This specifies the device configurations that support this custom kernel. +#rocm_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "none"}> + +module attributes {transform.with_named_sequence} { + util.func private @argmax_1d_f32_entry_point(%arg0: tensor<1x?xf32>) -> tensor<1xi64> { + %c1 = arith.constant 1 : index + %dim = tensor.dim %arg0, %c1 : tensor<1x?xf32> + // Note: This is not safe if the dim size exceeds INT32_MAX. To pass a 64 + // bit value it must be broken down into two 32-bit values for the high and + // low bits. + %dim_i32 = arith.index_cast %dim : index to i32 + // Inline external dispatch that conforms to the ABI that the kernel + // requires. This is the primary reason for the surrounding function as + // details like tensor shape and push constants need to line up after + // splicing in the custom dispatch. This allows the kernel author to manage + // such details by hand without needing the rewrite patterns to worry about + // things like order of push constants. 
+ %4 = hal.dispatch.extern "argmax_F32I64"[%dim](%dim_i32, %arg0) : (i32, tensor<1x?xf32>{%dim}) -> tensor<1xi64> + count(%device: !hal.device, %workload: index) -> (index, index, index) { + %c1_0 = arith.constant 1 : index + hal.return %c1_0, %c1_0, %c1_0 : index, index, index + } + layout(#hal.pipeline.layout, + <1, storage_buffer> + ]> + ]>) + bindings([ + #hal.interface.binding<0, 0>, + #hal.interface.binding<0, 1> + ]) + objects({ + #rocm_target ordinal(0) = [ + #hal.executable.object<{ + data = dense<"0x7f454c460201014003000000000000000300e0000100000000000000000000004000000000000000208f0000000000004100000040003800090040000f000d000600000004000000400000000000000040000000000000004000000000000000f801000000000000f801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000004c0c0000000000004c0c00000000000000100000000000000100000005000000000d000000000000001d000000000000001d000000000000007d000000000000007d00000000000000100000000000000100000006000000008a00000000000000aa00000000000000aa0000000000007000000000000000000600000000000000100000000000000100000006000000708a00000000000070ba00000000000070ba000000000000000000000000000091a801000000000000100000000000000200000006000000008a00000000000000aa00000000000000aa00000000000070000000000000007000000000000000080000000000000052e5746404000000008a00000000000000aa00000000000000aa00000000000070000000000000000006000000000000010000000000000051e57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000b806000000000000b806000000000000040000000000000007000000a306000020000000414d44475055000083ae616d646873612e6b65726e656c7391de0012a52e61726773dc001686ae2e61637475616c5f616363657373a9726561645f6f6e6c79ae2e616464726573735f7370616365a6676c6f62616ca52e6e616d65b2696e7075744275666665722e636f65726365a72e6f666673657400a52e73697a6508ab2e76616c75655f6b696e64ad676c6f62616c5f62756666657286ae2e61637475616c5f616363657373aa77726974655f6f6e6c79ae2e616464726573735f7370616365a6676c6f62616ca52e6e616d65b36f75747075744275666665722e636f65726365a72e6f666673657408a52e73697a6508ab2e76616c75655f6b696e64ad676c6f62616c5f62756666657284a52e6e616d65ad726564756374696f6e53697a65a72e6f666673657410a52e73697a6504ab2e76616c75655f6b696e64a862795f76616c756583a72e6f666673657418a52e73697a6504ab2e76616c75655f6b696e64b468696464656e5f626c6f636b5f636f756e745f7883a72e6f66667365741ca52e73697a6504ab2e76616c75655f6b696e64b468696464656e5f626c6f636b5f636f756e745f7983a72e6f666673657420a52e73697a6504ab2e76616c75655f6b696e64b468696464656e5f626c6f636b5f636f756e745f7a83a72e6f666673657424a52e73697a6502ab2e76616c75655f6b696e64b368696464656e5f67726f75705f73697a655f7883a72e6f666673657426a52e73697a6502ab2e76616c75655f6b696e64b368696464656e5f67726f75705f73697a655f7983a72e6f666673657428a52e73697a6502ab2e76616c75655f6b696e64b368696464656e5f67726f75705f73697a655f7a83a72e6f66667365742aa52e73697a6502ab2e76616c75655f6b696e64b268696464656e5f72656d61696e6465725f7883a72e6f66667365742ca52e73697a6502ab2e76616c75655f6b696e64b268696464656e5f72656d61696e6465725f7983a72e6f66667365742ea52e73697a6502ab2e76616c75655f6b696e64b268696464656e5f72656d61696e6465725f7a83a72e6f666673657440a52e73697a6508ab2e76616c75655f6b696e64b668696464656e5f676c6f62616c5f6f66667365745f7883a72e6f666673657448a52e73697a6508ab2e76616c75655f6b696e64b668696464656e5f676c6f62616c5f6f66667365745f7983a72e6f666673657450a52e73697a6508ab2e76616c75655f6b696e64b668696464656e5f676c6f62616c5f6f66667365745f7a83a72e6f666673657458a52e73697a65
02ab2e76616c75655f6b696e64b068696464656e5f677269645f64696d7383a72e6f666673657468a52e73697a6508ab2e76616c75655f6b696e64b668696464656e5f686f737463616c6c5f62756666657283a72e6f666673657470a52e73697a6508ab2e76616c75655f6b696e64b968696464656e5f6d756c7469677269645f73796e635f61726783a72e6f666673657478a52e73697a6508ab2e76616c75655f6b696e64ae68696464656e5f686561705f763183a72e6f6666736574cc80a52e73697a6508ab2e76616c75655f6b696e64b468696464656e5f64656661756c745f717565756583a72e6f6666736574cc88a52e73697a6508ab2e76616c75655f6b696e64b868696464656e5f636f6d706c6574696f6e5f616374696f6e83a72e6f6666736574cce0a52e73697a6508ab2e76616c75655f6b696e64b068696464656e5f71756575655f707472b92e67726f75705f7365676d656e745f66697865645f73697a6500b62e6b65726e6172675f7365676d656e745f616c69676e08b52e6b65726e6172675f7365676d656e745f73697a65cd0118a92e6c616e6775616765a84f70656e434c2043b12e6c616e67756167655f76657273696f6e920200b82e6d61785f666c61745f776f726b67726f75705f73697a65cd0400a52e6e616d65ad6172676d61785f463332493634bb2e707269766174655f7365676d656e745f66697865645f73697a65ccc0ab2e736770725f636f756e7424b12e736770725f7370696c6c5f636f756e7428a72e73796d626f6cb06172676d61785f4633324936342e6b64b82e756e69666f726d5f776f726b5f67726f75705f73697a6501b32e757365735f64796e616d69635f737461636bc2ab2e766770725f636f756e7420b12e766770725f7370696c6c5f636f756e744eaf2e7761766566726f6e745f73697a6520b92e776f726b67726f75705f70726f636573736f725f6d6f646501ad616d646873612e746172676574ba616d6467636e2d616d642d616d646873612d2d67667831313030ae616d646873612e76657273696f6e920102000000000000000000000000000000000000000000000000000100000022030700001d000000000000fc000000000000001700000022030700fc1d000000000000c0000000000000002f00000022030700bc1e0000000000004815000000000000a9000000220307007c860000000000006000000000000000cf00000011030600000c0000000000004000000000000000e000000011000a00006302000000000001000000000000005e00000022030700b43b000000000000a80900000000000070000000220307005c45000000000000c40300000000000084000000220307002049000000000000b83a0000000000009300000022030700d883000000000000780100000000000048000000220307000434000000000000b007000000000000c100000012030700008700000000000044110000000000000300000001000000040000001a00000000000090021000000c060000400002000000000001000a80000000808200a06301000000070000000b000000b02f30fd66a54e0f20613c71de07d6f6de6bdee749a5522bf447dcf8e09bd59ef67b32e2a134fbcc2ca8677d43090ac70d0000000d0000000c00000004000000000000000b00000000000000060000000a00000000000000070000000300000000000000000000000800000000000000000000000000000000000000000000000000000000000000010000000500000002000000090000000000000000000000005f5f6f636b6c5f6465766d656d5f72657175657374005f5f6f636b6c5f686f737463616c6c5f70726576696577005f5f6f636b6c5f686f737463616c6c5f696e7465726e616c005f5f6f636b6c5f6873615f7369676e616c5f616464005f5f6f636b6c5f646d5f696e69745f7631005f5f6f636b6c5f6765745f6c6f63616c5f6964005f5f6f636b6c5f646d5f7472696d005f5f6f636b6c5f6163746976656c616e655f753332005f5f6f636b6c5f73616e6974697a65725f7265706f7274006172676d61785f463332493634006172676d61785f4633324936342e6b64005f5f6869705f637569645f366134356234353732363564653237310000000000c00000001801000000000000007b0000000000000000000000000000000000000000000000000000000000000301af609b1300001a0400000000000000000000000000009001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000089bf210080be2000a1bec12281be000069dc002821000100febe280061d70004010020902081280061d71e000100280061d71f0201000303087e0203067e0103047e0003027e03030a7e01030a7e004780be00ff0080a800000001ff0182000000008302007e8002207e10030a7e10030c7e10030e7e1003107e1003127e1003147e1003167e1003187e10031a7e10031c7e10031e7e00499ebe040069dc00012100040051dc000021030002027ef70389bf0303047ea00080be01003dd7000202001f0060d7280301001e0060d728010100000060d728050100c12281be000051dc000021280100febe20d020810000a1bef70389bf1e4880be000089bf1003247e0f03227e0e03207e0d031e7e0c031c7e0b031a7e0a03187e0903167e0803147e0703127e0603107e05030e7e04030c7e03030a7e0203087e0103067e0003047e004780be00ff008000eeffff01ff0182ffffffff000000f4000000f8ff0081bef401000007fc89bf000104bfff0081be50000000980080be00010098040004f40000000007fc89bf000083bea00082be00028085000082be004780be00ff00801c00000001ff0182000000000302007e0202027e004880be000089bf210080be2000a1bec12481be340169dc00132100380169dc000021003c0169dc00022100400169dc00032100440169dc00052100c100febe000069dc002821000100febe280061d722040100280061d70006010020ff208150010000280061d71e000100280061d71f020100800069dc001f2100130061d706000100130061d707020100c122a2be040069dc001321002200febe7c0069dc001221001103247e7c0051dc00002111780069dc001021000f03207e780051dc0000210f740069dc000e21000d031c7e740051dc0000210d700069dc000c21000b03187e700051dc0000210b6c0069dc000a21000903147e6c0051dc00002109680069dc000821000703107e680051dc00002107640069dc0006210005030c7e640051dc00002105600069dc000421000303087e600051dc000021035c0069dc000221000003047ec122a2be040051dc000021002200febef70389bf000061d70f040100000061d70e060100000061d70d080100000061d70c0a0100000061d70a0c0100000061d70b0e0100000061d708100100000061d709120100000061d704140100000061d7051601001103267e0f03227e0d031e7e0b031a7e0903167e0703127e05030e7e03030a7e0103067e54006ddc001221004c006ddc0010210044006ddc000e21003c006ddc000c210034006ddc000a21002c006ddc0008210024006ddc000621001c006ddc00042100800081bec10080be01001fd700020000010020d7000202000103087e180069dc000421000105007e10006ddc0002210001004ad401010000010080be000061d700180100800182be8002027e8002047e08006ddc000121007e0080be000061d7001a0100c122a2be040069dc000021002200febe0001008b0000febe0101a5bfc122a2be040051dc000021002200febe100055dc00002101980182bef70389bf0103067e020081be0203087e030080be030100d703030000050020d5040104000503087e9c006ddc00032100184056dc01007c03f70389bf0000b0e0000000000000ace000000000a80182be01030a7e020081be02030c7e030080be050100d705030000070020d50601040007030c7e94006ddc00052100000056dc01007c06280056dc01007c0b0403127ef70389bf0c030a7e05001bd5051302000303147e0b03107e0d001bd50815020005031c7e0d030a7e980081be0b00fed6050300020c031e7e00020a7e0503207ea00080be0d003dd7001a02000d030a7e0d01fed605033c0401020a7e05031c7e0e003cd7001a02000f03107e0b03187e800080be80020a7e05031a7e0d030a7e05001cd5051102000e03167e0c03107e0b001cd5081702000503187e06030a7e0b03107e07030c7e0c030e7e050000d705110200070020d5060f020007030c7e004056dc05007c05f70389bf0603167e0b030c7e0a030e7e0903107e18400add01057c01f70389bf0000b0e0000000000000ace00000000001005dd401070200800080be000061d7001c01000203087e0103067e8c006ddc0003210084006ddc000121007e0080be000061d7001e0100c122a2be040069dc000021002200febe0001008b0000febe7600a5bfc122a2be040051dc000021002200febef70389bf010060d7001d01008c0055dc000021039c0055dc00002101940055dc00002108100055dc00002105010083bff70389bf000056dc05007c06000056dc08007c0b0403127ef70389bf0c030a7e05001bd5051302000303147e0b03107e0d001bd50815020005031c7e0d030a7e980082be0b00fed6050500020c031e7e00020a
7e0503207ea00080be0d003dd7001a02000d030a7e0d02fed605053c0402020a7e05031c7e0e003cd7001a02000f03107e0b03187e800080be80020a7e05031a7e0d030a7e05001cd5051102000e03167e0c03107e0b001cd5081702000503187e06030a7e0b03107e07030c7e0c030e7e050000d705110200070020d5060f020007030c7e004056dc05007c05f70389bf0603167e0b030c7e0a030e7e0903107e00400add01057c01f70389bf0000b0e0000000000000ace00000000000005ad4010702000001008c000081be000061d7011c01000203087e0103067e8c006ddc00032100a4006ddc00012100000081be000061d701200100c122a2be040069dc000021002200febe7e007e9197ffa6bfc122a2be040051dc000021002200febef70389bf000060d7002101007e007e8ca40055dc00002100f70389bf84006ddc00002100c122a2be040051dc000021022200febef70389bf000060d7021f01007e007e8c840055dc00002100f70389bf08006ddc00002100c122a2be040051dc000021002200febef70389bf000060d7001b01007e007e8c010060d700190100100055dc00002101080055dc00002105f70389bf0503087ea00080be05003dd7000a02000503067e0405087e0305047e050083be02008284800080be000085be0204828c000061d702220100000061d703240100000056dc01007c03f70389bfc4006ddc00032100a80186be0103067e060084be0203087e070080be030400d703090000050020d5040110000503087ebc006ddc00032100280056dc01007c04030080bef70389bf0503067e03001bd500060200020080be04001bd50008020003030a7e05030e7e04030c7eb4006ddc00062100080056dc01007c028c0080be05003cd700080200f70389bf0203027e0503087e0303047e0603067e010000d701090200030020d5020702000303047eac006ddc000121007e0182be000061d702260100000061d7032801007e0080be000061d7002a0100c122a2be040069dc000021002200febe0001008b0000febe4000a5bfc122a2be040051dc000021032200febef70389bf000060d703270100010060d7032901005c0051dc00002102c40055dc00002107b40055dc00002100f70389bf00030a7e980083be0302fed6050700020403127e02020a7e0503147ea00082be00003dd70200020000030a7e0003fed60507240403020a7e0503027e09003cd7020002000a03027e0303087e800082be8002007e00030a7e0503007e00001cd5000302000903067e0403027e05001cd50107020000030c7e0703007e0503087e0803027e0603067e000200d700090200030220d501070a000303027e10006adc00027c000102067e0002047e08006edc00027c008102047e14006adc00027c00c122a2be040051dc000021002200febef70389bf000060d7002b01007e007e8c010060d700190100540055dc000021034c0055dc00002105440055dc000021073c0055dc00002109340055dc0000210b2c0055dc0000210d240055dc0000210f1c0055dc00002111ac0055dc00002117180051dc00002101800080be8002267e1303047e860080bef70389bf15003cd7000202001703027e1503287e1803047e1603267e010000d701290200130020d5022702001303047ed4006ddc0001210000006edc01117c00880184be0103227e040082be0203247e050080be110200d711050000130020d5120108001303247ecc006ddc0011210008006edc010f7c0010006edc010d7c0018006edc010b7c0020006edc01097c0028006edc01077c0030006edc01057c0038006edc01037c007e0080be000061d7002c0100c122a2be040069dc000021002200febe0001008b0000febeb300a5bfc122a2be040051dc000021002200febef70389bf020060d700230100030060d700250100100055dc00002101c40055dc00002106bc0055dc00002108a00184bef70b89bf0103067e040081be0203087e050080be030100d703030000050020d5040104000503087eec006ddc00032100204056dc01007c03f70789bf000056dc08007c08030080bef70389bf09030a7e05001bd505010000020081be08001bd5080300000503127e08030a7e980083be0a02fed6050700020b03187e02020a7e05031a7ea00082be08003dd70210020008030a7e0803fed60507300403020a7e0503127e08003cd7021002000903187e800082be80020a7e0503167e0b030a7e05001cd5051902000803127e0a03107e09001cd5081302000503147e06030a7e0903107e07030c7e0a030e7e050200d705110200070220d5060f0a0007030c7ee4006ddc0005210000006edc05037c000403127e0303147e01020a7e0002167e0b030c7e0a030e7e0903107e00007cbc20400add01057c01f70389bf02005ad40107020001005dd401070200800080be000061d7022e0100000061d700300100dc006ddc0
00121007e0080be000061d700320100c122a2be040069dc000021002200febe0001008b0000febe4800a5bfc122a2be040051dc000021002200febef70389bf000060d700310100010060d7002f0100020060d700230100030060d700250100dc0055dc00002103ec0055dc00002101e40055dc00002105010083bff70389bf00006edc05037c000403127e0303147e030081be02020a7e0102167e0b030c7e0a030e7e0903107e00007cbc00400add01057c01f70389bf01005ad4010702000100008c000061d7012e0100000081be000061d701300100dc006ddc00012100000081be000061d701340100c122a2be040069dc000021002200febe7e007e91caffa6bfc122a2be040051dc000021002200febef70389bf000060d7003501007e007e8c0900a0bfc122a2be040051dc000021002200febef70389bf000060d7002d01007e007e8c3700a0bfc122a2be040051dc000021022200febef70389bf000060d7023301007e007e8c0f0060d7020501000e0060d7020701000d0060d7020901000c0060d7020b01000a0060d7020d01000b0060d7020f0100080060d702110100090060d702130100060060d702010100070060d702030100040060d702150100050060d702170100800051dc0000211f100055dc00002100f70389bf100056dc00007c02a00080bef70389bf00003dd7000402000003027e0203007e004780be00ff00803c07000001ff0182000000008102047e8002067e8302087e00499ebec0ffa0bfc40055dc00002106b40055dc00002100f70389bf0003047e980081be0300fed6020300020403107e0002047e0203127ea00080be00003dd7000002000003047e0001fed6020320040102047e0203027e01003cd70000020002030a7e800080be8002007e0003087e0403007e00001cd5000b02000103047e0303027e04001cd50105020000030a7e0603027e0403067e0703007e0503047e010000d701070200000020d5000502000003047e940182be0103007e020081be0203027e030080be000100d700030000020020d5010104000203027ef4006ddc00002100c122a2be040051dc000021002200febef70389bf010060d700190100810080be8102027efc0069dc000121007e0080be000061d700360100c122a2be040069dc000021002200febe0001008b0000febe0f00a5bff40055dc00002100f70389bf004052dc00007c00f70389bf0000b0e0000000000000ace000000000810080be00001bd500010000fc0069dc00002100c122a2be040051dc000021002200febef70389bf000060d7003701007e007e8cfc0051dc00002101f70389bf0105027ec10080be800082be010206bf000061d7003801007e00a2bec100febe040069dc000021002200febe0d00a2bfc122a2be040051dc000021002200febe010083bf800080bef70389bf000061d700380100c122a2be040069dc000021002200febec122a2be040051dc000021002200febef70389bf000060d700390100000001d580020100810080be00004dd4000100007e006a8baaffa4bfc122a2be040051dc000021002200febef70389bf010060d700190100cc0055dc00002101d40055dc00002103f70389bf000056dc03007c03f70389bf08016ddc00032100000056dc01007c01f70389bf00016ddc000121007e0080be000061d7003a0100c122a2be040069dc000021002200febe0001008b0000febee800a5bfc122a2be040051dc000021032200febef70389bf020060d703230100030060d703250100c122a2be100151dc000021002200febe100055dc00002101bc0055dc00002104f70389bf000056dc04007c0d810184bef70389bf0d03087e040081be0e030a7e050080be040100d704030000060020d50501040006030a7e04030c7e020081be05030e7e030080be060100d706030000080020d50701040008030e7e800180be00005ad4060100000503127e0703107e0c0001d50813020004030a7e0603087e060001d5040b02000603147e0c03167e0a03087e0b030a7e2c016ddc00042100980182be0103087e020081be02030a7e030080be040100d704030000070020d50501040007030a7e24016ddc00042100184056dc01007c04000056dc01007c080b030e7e0e031e7e07001bd5071f02000d03167e0a001bd50a1702000703167e0a030e7e980081be0d00fed6070300020e031e7e00020e7e0703207ea00080be0a003dd7001402000a030e7e0a01fed607033c0401020e7e0703167e0a003cd7001402000b031e7e800080be80020e7e07031c7e0e030e7e07001cd5071f02000a03167e0d03147e0d001cd50a17020007031c7ef70389bf08030e7e0d03147e0903107e0e03127e070000d707150200090020d5081302000903107e1c016ddc0007210000006edc07047c000503147e0403167e0c030e7e0b03107e0a03127e00007cbc18400add01067c01f70389bf0200
5ad40109020001005dd401090200800080be030061d7023c0100030061d7003e0100c122a2be040069dc000321002200febe14016ddc000121007e0080be000061d700000100c122a2be100169dc000021002200febe0001008b0000febe4200a5bfc122a2be040051dc000021032200febef70389bf000060d7033f0100010060d7033d0100c122a2be100151dc000021002200febe140155dc00002104240155dc000021012c0155dc000021061c0155dc00002108010083bff70389bf00006edc08047c000503147e0403167e0703187e0c030e7e0b03107e0a03127e00007cbc00400add01067c01f70389bf01005ad4010902000100008c030061d7013c0100000081be030061d7013e0100c122a2be040069dc000321002200febe14016ddc00012100000081be000061d701020100c122a2be100169dc000021002200febe7e007e91c6ffa6bfc122a2be100151dc000021002200febef70389bf000060d7000301007e007e8cc122a2be100151dc000021002200febef70389bf000060d7000101007e007e8cc122a2be040051dc000021052200febef70389bf000060d7053b01007e007e8cc122a2be100151dc000021042200febe000155dc00002106080155dc00002101f70389bf0103007e0203027e0603047e0703067e1f0060d7280301001e0060d728010100220060d728050100000060d728070100c12481be340151dc00002113380151dc000021003c0151dc00002102400151dc00002103440151dc00002105c100febe000051dc000021280100febe20ff2081b0feffff0000a1bef70389bf1e4880be000089bfc12480be300069dc000020000000febe200069dc000420001c0069dc000320000203067e1c0051dc00002002180069dc000120000003027e180051dc00002000f70789bf0203087ef70389bf0003047e10006ddc000320000203087e0103067e08006ddc00032000880182be0103007e020081be0203027e030080be000100d700030000020020d5010104000203027e00006ddc00002000c12285be240051dc000020000500febe200051dc00002001830080bef70389bf000044d401010000800081be000061d7010001007e0081be0100008b0001018d000061d701020100c12285be240069dc000020000500febe0000febe1900a5bfc12285be240051dc000020000500febe200051dc00002001840080bef70389bf000044d401010000800081be000061d7010401007e0081be0100008b0001018d000061d701060100c12285be240069dc000020000500febe0000febe2501a5bf1b00a0bfc12285be240051dc000020000500febef70389bf000060d700030100002280be020060d700010100000061d702080100800081be000061d7020a0100000061d7010c01007e00008b000061d7000e0100c12285be240069dc000020000500febe7e007e8dc500a5bf2a00a0bfc12285be240051dc000020000500febe200051dc00002001850080bef70389bf01004ad401010000c10080be000061d7001001007e0080be000061d700120100c12285be240069dc000020000500febe0001008b0000febed200a5bf0701a0bfc12285be240051dc000020000500febef70389bf010060d7001501007e017e8c000060d700170100007e008b000061d700000100c12285be240069dc000020000500febebbffa0bfc12285be240051dc000020000500febe200051dc00002001820080bef70389bf000044d4010100007e0081be0100008b0001018d000061d701180100c12285be240069dc000020000500febe0000febe0100a5bfa000a0bfc12285be240051dc000020000500febef70389bf000060d700190100002280be020060d700090100800081be000061d7021a0100000061d7011c01007e00008b000061d7001e0100c12285be240069dc000020000500febe7e007e8d1a00a5bfc12285be240051dc000020000500febef70389bf010060d700090100200051dc00002001810080bef70389bf020041d401010000c10080be7e0080be017e0191027e028b0102018c000061d7011a0100000061d7001c0100c12285be240069dc000020000500febec12285be240051dc000020000500febef70389bf030060d7001f01007e037e8c010060d700090100020060d7001b0100000060d7001d0100007e008b017e0191027e028b0102018c000061d7010a0100000061d7000c0100c12285be240069dc000020000500febe3700a0bfc12285be240051dc000020000500febef70389bf000060d700210100000055dc00002001100055dc00002003f70389bf00000edd01037c00800081be007e0091000061d700220100c12285be240069dc000020000500febec12285be240051dc000020000500febef70389bf000060d7002501007e007e8c010060d7002301007e0080be000061d700260100c12285be240069dc000020000500febe0001008b0000feb
e8200a5bf000055dc00002000100055dc00002002f70389bf00000edd00027c0000007cbc0000b0e0000000000000ace0000000007500a0bfc12285be240051dc000020000500febef70389bf020060d7000f01007e027e8c000060d7000b0100010060d7000d0100000061d701200100000061d7012201007e0081be0100008b0001018d000061d701240100c12285be240069dc000020000500febe0000febec4ffa5bfadffa0bf000055dc00002000100055dc00002002f70389bf00007cbc00000edd00027c0057ffa0bfc12285be240051dc000020000500febef70389bf010060d7001301007e017e8c000060d700110100007e008b000061d700040100c12285be240069dc000020000500febe0e00a0bf000055dc00002000100055dc00002002f70389bf00007cbc00000edd00027c0000007cbc0000b0e0000000000000ace0000000000fffa0bfc12285be240051dc000020000500febef70389bf000060d700070100002280be010060d700050100000061d7011601007e00008b000061d700140100c12285be240069dc000020000500febe7e007e8dfafea5bfdcffa0bfc12285be240051dc000020000500febe000055dc00002001100055dc00002003f70389bf00007cbc00000edd01037c0000007cbc0000b0e0000000000000ace000000000800080be7ec1008d000061d700100100c12285be240069dc000020000500febeb0ffa0bfc12285be240051dc000020000500febef70389bf000060d7002701007e007e8c080055dc00002001f70389bf100056dc01007c01f70389bf28006ddc00012000800180be01005dd4010100007e0080be000061d700280100c12285be240069dc000020000500febe0001008b0000febe1500a5bf280055dc00002001080055dc00002003f70389bf180052dc03007c00800080be80020a7ef70389bf0003067e0503087e00007cbc00006edc01037c000005007eff0081beff0000000001008b0000fdbe0100b6bfc12285be240051dc000020000500febef70389bf000060d7002901007e007e8cc12480be300051dc000020000000febe070089bf1e4880be000089bf210080be2000a1bec12481be380069dc00002100c100febe000069dc002821000100febe280061d722040100280061d70006010020c02081280061d71e000100280061d71f0201002c0069dc00052100140069dc00042100280069dc000321000103087e280051dc00002101f70389bf0103067e0403027e20006ddc0002210018006ddc00002100004780be00ff00801809000001ff0182000000008002007e100069dc0000210000499ebe140051dc000021040003047ec122a2be040051dc000021002200febe01030a7e100051dc000021010503067e08006ddc00022100f70389bf01004dd4040302007e0080be000061d700000100c122a2be040069dc000021002200febe0001008b0000febeca01a5bfc122a2be040051dc000021002200febe180055dc00002105080055dc00002101840080bef70389bf010018d500020200800080be8002067e0303047e0103067e0503087e0203027e0603047e030000d703090200010020d5010502000103087e800084be000061d704020100040080be040081be040082be040083be000061d700040100000061d701060100000061d702080100000061d7030a0100c122a2be040069dc000021002200febe0302107e02020e7e01020c7e00020a7e000076dc03057c000303007e340069dc00002100ff0085be00100000020500d7050002000403027e300069dc00012100040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00200000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00300000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00400000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00500000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00600000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00700000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00800000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00900000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00a00000020500d705000200040520d5040216000403067e03
020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00b00000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00c00000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00d00000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00e00000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00f00000020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00000100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00100100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00200100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00300100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00400100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00500100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00600100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00700100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00800100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00900100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00a00100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00b00100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00c00100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00d00100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00e00100020500d705000200040520d5040216000403067e03020e7e02020c7e01020a7e0002087e000076dc02047c00ff0085be00f00100000500d705000200020420d5040216000203027e03020a7e0202087e0102067e0002047e000076dc00027c00c122a2be040051dc000021002200febef70389bf000060d7000101007e007e8c080055dc00002101800080bef70389bf01004ad4010100007e0080be000061d7000c0100c122a2be040069dc000021002200febe0001008b0000febe2600a5bf200055dc000021022c0051dc00002104180055dc00002105f70389bf0503007eff0080be00a00100000100d7000002000603027e800080be050020d5000206000503027e00086edc00027c00800080be80020c7e06030a7e950080be05003cd7000802000503087e02030e7e06030a7e03030c7e040000d7040f0200060020d5050d020006030a7e08086edc00047c0010086edc00027c00c122a2be040051dc000021002200febef70389bf000060d7000d01007e007e8c1f0060d7280301001e0060d728010100220060d728050100000060d728070100c12481be380051dc00002100c100febe000051dc000021280100febe20ff2081c0ffffff0000a1bef70389bf1e4880be000089bfc12480be240069dc00002000280069dc000120002c0069dc000220000000febe040069dc001f2000000069dc00002000c12283be080051dc000020000300febe000051dc00002001800080bef70389bf000044d4010100000102027e0c0069dc000120007e0081be0100008b0001018d000061d701000100c12283be080069dc000020000300febe0000febe3100a5bfc12283be080051dc000020000300febe000051dc00002001810080bef70389bf000044d4010100000102027e100069dc000120007e0081be0100008b0001018d000061d701020100c12283be080069dc000020000300febe0000febe5000a5bfc12283be080051dc000020000300febe000051dc00002001820080bef70389bf01004ad401010000800080be8002027e140069dc000120007e008
0be000061d700040100c12283be080069dc000020000300febe0001008b0000febe5d00a5bf5500a0bfc12283be080051dc000020000300febef70389bf000060d700010100002280be0c0051dc00002001f70389bf180069dc000120007e00008b000061d700060100c12283be080069dc000020000300febe7e007e8d6f00a5bfc12283be080051dc000020000300febe000051dc00002001800080bef70389bf01004ad4010100000002027e1c0069dc000120007e0080be000061d700080100c12283be080069dc000020000300febe0001008b0000febe4b00a5bf040051dc00002000ff0080beff030000f70389bf00001bd5000100001c0069dc000020004100a0bfc12283be080051dc000020000300febef70389bf000060d700030100002280be100051dc00002001f70389bf200069dc000120007e00008b000061d7000a0100c12283be080069dc000020000300febe7e007e8d1d00a5bf040051dc00002000f70389bf000010d600152902200069dc000020001500a0bf040051dc00002000f70389bf000010d600292902140069dc00002000c12283be080051dc000020010300febef70389bf000060d7010501007e007e8c140051dc00002000f70389bf100069dc00002000cdffa0bfc12283be080051dc000020010300febef70389bf000060d7010b01007e007e8c200051dc00002000f70389bf0c0069dc0000200088ffa0bfc12283be080051dc000020010300febef70389bf000060d7010901007e007e8c1c0051dc00002000f70389bf180069dc00002000c12283be080051dc000020020300febef70389bf000060d7020701007e007e8c180051dc000020008002027ec12480bef70389bf240051dc00002000280051dc000020012c0051dc000020020000febef70389bf1e4880be000089bf210080be2000a1bec12481bec40169dc00022100c80169dc00002100cc0169dc00052100d00169dc00032100d40169dc00042100d80169dc00012100dc0169dc00082100e00169dc00072100e40169dc00062100c100febe000069dc002821000100febe280061d722040100280061d70006010020ff2081f0010000280061d71e000100280061d71f020100100069dc001f2100020061d706000100020061d707020100c122a2be040069dc000221002200febe0103067e0003027ec122a2be040051dc000021002200febef70389bf000061d70f040100000061d70e060100000061d70d080100000061d70c0a0100000061d70a0c0100000061d70b0e0100000061d708100100000061d709120100000061d704140100000061d7051601000303047e800180be01005dd401010000c10080be010001d5000206000c0069dc000121008002047e004780be00ff0080d4c1ffff01ff0182ffffffff000042dc02000002800080bef70389bf01003dd402010000000061d70118010002003ad402010000000061d7021a0100ff0081be00020000ff0080be00010000027e028b00010098010025d501010000080069dc00012100004780be00ff00806cc1ffff01ff0182ffffffff800000f4000000f8004780be00ff0080786f000001ff018200000000ff0083bef401000007fc89bf020304bf000061d7001c0100000061d7011e01007e00a2bec100febe040069dc000021002200febe1400a2bfc122a2be040051dc000021002200febef70389bf000060d700110100010060d700130100000004f4600000f807fc89bf000061d7001c0100000061d7011e0100c122a2be040069dc000021002200febec122a2be040051dc000021002200febef70389bf030060d7001b0100000060d7001d0100010060d7001f0100080051dc000021010c0051dc00002103ff02047e00a00100000061d700200100000061d701220100100856dc02000004f70389bf2c006ddc00042100080856dc02000004f70389bf24006ddc00042100800080bec10081be02001fd701000000020020d701040200200069dc0002210001004ad402010000000061d701240100c00082bea00081be037e048b01020198000061d701260100ff0082bec0ffffffff0081bee0ffffff037e038b01020198000061d701280100c40081be040025d5030300001c0069dc00042100820081be020018d501040200030025d503050200180069dc00032100010025d501050200140069dc00012100000061d7002a0100c122a2be040069dc000021002200febe3000a0bf1f0060d7280301001e0060d728010100c122a2be340051dc000021002200febec122a2be380051dc000021012200febec122a2be040051dc000021022200febe220060d728050100000060d728070100c12481bec40151dc00002102c80151dc00002100cc0151dc00002105d00151dc00002103d40151dc00002104d80151dc00002101dc0151dc00002108e00151dc00002107e40151dc00002106c100febe000051dc000021280100febe
20ff208110feffff0000a1be070089bf1e4880bec122a2be040051dc000021002200febef70389bf010060d700250100000060d7002b0100000061d7002c0100800080be8002027e3c0069dc000121007e0080be000061d7002e0100c122a2be040069dc000021002200febe0001008b0000febe1c00a5bfc122a2be040051dc000021002200febef70389bf020060d700210100030060d700230100000060d7002d0100800084be040081be870084be00048484020080be030081be040083be050082be0003008001020282020081be8002007e004852dc00000000f70389bf3c0069dc00002100c122a2be040051dc000021002200febef70389bf000060d7002f01007e007e8c3c0051dc00002101f70389bf0105007e000061d700300100c122a2be040069dc000021002200febe800081be000106bf2b00a2bfc122a2be040051dc000021052200febef70389bf020060d7052d0100010060d705290100000060d705310100c10083be000300810001018b800080be000083be800080be050061d702320100050061d7033401000002087e0002067e0102047e0002027e0002007e050061d700360100c122a2be040069dc000521002200febe500069dc000421004c0069dc00032100480069dc00022100440069dc00012100400069dc000021000100a0bf080da0bfc122a2be040051dc000021032200febef70389bf010060d703270100000060d703310100020060d703370100c122a2be380051dc000021002200febe400051dc00002101500051dc000021024c0051dc00002104480051dc00002105440051dc00002106f70389bf6c0069dc00062100680069dc00052100640069dc00042100030061d702380100000049d401010000030061d7003a0100010049d4020300000100018b800082be030061d7023c01000103087e600069dc000421000203087e5c0069dc00042100030061d7003e0100c122a2be040069dc000321002200febe580069dc00022100540069dc000121007e0080be000061d700000100c122a2be380069dc000021002200febe0001008b0000febee102a5bfc122a2be040051dc000021042200febec122a2be380051dc000021002200febef70789bf000060d704310100f70389bf010060d700030100020060d7043d0100200051dc00002102600051dc000021015c0051dc00002103f70389bf780069dc00032100740069dc00012100000061d702040100000061d701060100010025d501050200700069dc00012100010049d401010000800080be000061d7000801007e0080be000061d7000a0100c122a2be380069dc000021002200febe0001008b0000febedb00a5bfc122a2be380051dc000021002200febe700051dc00002101ff0080be00010000f70389bf000049d4010100000202027e0302047e7c006ddc000121007e0081be0100008b0001018d000061d7010c0100c122a2be380069dc000021002200febe0000febe0100a5bf7200a0bfc122a2be380051dc000021002200febef70389bf000060d7000d0100002280be7c0055dc00002101f70389bf84006ddc000121007e00008b000061d7000e0100c122a2be380069dc000021002200febe7e007e8dbb00a5bfc122a2be040051dc000021012200febef70389bf040060d701210100050060d701230100080060d701330100090060d701350100700051dc00002100ff0080be00fffffff70389bf010025d500010000880080be010019d500020200080080beff0081be0018000000018296a00086be08068885080083be0301039602030281070083be0206868400010296800080be000083be0206868c040083be050081be060084be070082be0304048001020182010085be980081be03002cd7010300000002027e0103087e040083be0303047e050082be0403027e020300d703040200010220d502020e000103067e0203027eff0082be00200000010300d7020202000303047e800082be030220d502040e000303047e004856dc01007c01ff0082beff00000000001bd50005000004002cd7000300000002007e00030a7ef70389bf0103007e0403067e0203027e0503047e000000d700070200020020d5010502000203027e84006ddc000021005f00a0bfc122a2be040051dc000021012200febef70389bf020060d701210100030060d701230100080060d701330100090060d701350100700051dc00002100980080bef70389bf0100fed6000100020103087e800086be0602007e00030a7e0503007e0203027e0002067e0303047ea00087be02003cd7070202000303027e00001cd5000302000403027e01001cd5010502000003047e080080beff0081be001800000001849608078885080085be0501059604050481080085be0407848400010096060081be0004848c020080be030081be040083be050082be0003008001020282020081beff0184be00280000000082be010
080be040083be050081be0203028000010082000083be020081be0103007e030080be0203027e000100d701000200020020d5000206000203027e7c006ddc0000210040ffa0bfc122a2be380051dc000021002200febef70389bf010060d7000b01007e017e8c000060d700090100000061d700100100c122a2be380069dc000021002200febede00a0bfc122a2be380051dc000021002200febef70389bf000060d7000f01007e007e8c240055dc000021032c0055dc00002105840055dc00002101900182bef70389bf01030e7e020081be0203107e030080be070100d707030000090020d5080104000903107e9c006ddc00072100104052dc01007c07880182be0103107e020081be0203127e030080be080100d7080300000a0020d5090104000a03127e94006ddc00082100084056dc01007c01f70389bf8c006ddc00012100800080be00004ad407010000800182be01005dd4010500000001008b010059d4010b020002005ed4010702000102018c0001018b800080be000061d7001201007e0080be000061d700140100c122a2be380069dc000021002200febe0001008b0000febe8400a5bfc122a2be040051dc000021002200febef70389bf0f0060d7000501000e0060d7000701000d0060d7000901000c0060d7000b01000a0060d7000d01000b0060d7000f0100080060d700110100090060d700130100060060d700010100070060d700030100040060d700150100050060d700170100100051dc0000211f8c0055dc00002102a00080bef70389bf00003dd7000402000003027e0203007e004780be00ff008048c6ffff01ff0182ffffffff8002067ea40069dc000321000303047e00499ebec122a2be040051dc000021052200febe940055dc00002103a40051dc00002102100051dc0000211ff70f89bf040060d705150100050060d705170100060060d705010100070060d705030100080060d705110100090060d7051301000a0060d7050d01000b0060d7050f01000c0060d7050b01000d0060d7050901000e0060d7050701000f0060d7050501009c0055dc0000210080020a7e80020c7ef70f89bf00006edc03057c00f70389bf00006adc00027c00004780be00ff0080442c000001ff01820000000000499ebe780051dc000021040c0051dc00002101700051dc000021020003067ec122a2be380051dc000021002200febe820080be030018d500060200f70f89bf040018d500080200f70b89bf010055d601071204f70789bf000034d801020000c10080be7e0080bef70389bf000061d700120100c122a2be380069dc000021002200febec122a2be380051dc000021002200febef70389bf010060d7001501007e017e8c000060d700130100007e008b000061d700080100c122a2be380069dc000021002200febe11ffa0bfc122a2be040051dc000021012200febef70389bf010060d701190100c122a2be380051dc000021002200febec10080be010001d580020500810081be01004dd4010300007e016a8bf70389bf000061d700160100c122a2be380069dc000021002200febe1900a4bfc122a2be380051dc000021002200febef70389bf000060d700110100010001d580020100800080be02004dd401010000800080be000083be021982be000083be020081be000061d701180100000061d700160100c122a2be380069dc000021002200febec122a2be380051dc000021002200febef70389bf010060d700170100000060d700190100000001d580020500810081be01004dd4000300007e016a8b0002007ea80069dc000021003000a4bfc122a2be040051dc000021012200febec122a2be380051dc000021002200febef70789bf0f0060d7010501000e0060d7010701000d0060d7010901000c0060d7010b01000a0060d7010d01000b0060d7010f0100080060d701110100090060d701130100060060d701010100070060d701030100040060d701150100050060d701170100f70389bf000060d700110100100051dc0000211f000001d580020100004780be00ff0080242b000001ff01820000000000499ebea80069dc00002100c122a2be040051dc000021032200febec122a2be380051dc000021002200febef70389bf010060d700070100030060d700050100040060d703270100000060d703310100740051dc00002101780051dc00002104a80051dc00002102f70389bf020025d502090200010025d501090000020049d40101000000004ed40101000004004ed4020900000004008c7e00008b0003008c017e0191027e028b0102018c000061d7011a0100000061d701020100000081be030061d7013c0100c122a2be040069dc000321002200febe0103067e600069dc000321000203067e5c0069dc00032100b00069dc00022100ac0069dc00012100000081be000061d7011c0100c122a2be380069dc000021002200febe7e007e9148fda6
bfc122a2be380051dc000021002200febef70389bf000060d7001d01007e007e8cc122a2be380051dc000021032200febec122a2be040051dc000021022200febef70789bf010060d7031b0100f70389bf000060d7023b0100b00051dc00002101ac0051dc00002100007e0091017e018b0001008c020061d7003e0100c122a2be040069dc000221002200febef70789bf580069dc00012100f70389bf540069dc00002100c122a2be040051dc000021042200febec122a2be380051dc000021002200febef70389bf010060d7000101007e017e8c000060d7043f0100640051dc00002101580051dc00002102540051dc00002103f70389bfd00069dc00032100cc0069dc00022100000061d7001e0100800080be01004dd402010000c10080be02020c7e02020a7e0202087e0202067e0202047e000061d700200100c80069dc00062100c40069dc00052100c00069dc00042100bc0069dc00032100b80069dc00022100b40069dc000121007e0080be000061d700220100c122a2be380069dc000021002200febe0001008b0000febe5b03a5bfc122a2be040051dc000021032200febef70389bf010060d703270100000060d703310100c122a2be380051dc000021002200febe6c0051dc00002101680051dc00002102f70389bf000049d402010000010049d4010300000001018b800080be000061d7002401000103067ee00069dc000321000203067edc0069dc00032100d80069dc00022100d40069dc000121007e0080be000061d700260100c122a2be380069dc000021002200febe0001008b0000febe6f03a5bfc122a2be040051dc000021042200febec122a2be380051dc000021002200febef70789bf000060d704310100f70389bf010060d700250100200051dc00002102dc0051dc00002101e00051dc00002103f70389bfec0069dc00032100000061d701280100e80069dc00012100010025d501050200e40069dc00012100010049d401010000800080be000061d7002a01007e0080be000061d7002c0100c122a2be380069dc000021002200febe0001008b0000febedb00a5bfc122a2be380051dc000021002200febee40051dc00002101ff0080be00010000f70389bf000049d4010100000202027e0302047ef0006ddc000121007e0081be0100008b0001018d000061d7012e0100c122a2be380069dc000021002200febe0000febe0100a5bf7200a0bfc122a2be380051dc000021002200febef70389bf000060d7002f0100002280bef00055dc00002101f70389bff8006ddc000121007e00008b000061d700300100c122a2be380069dc000021002200febe7e007e8dbb00a5bfc122a2be040051dc000021012200febef70389bf040060d701210100050060d701230100080060d701330100090060d701350100e40051dc00002100ff0080be00fffffff70389bf010025d500010000880080be010019d500020200080080beff0081be0018000000018296a00086be08068885080083be0301039602030281070083be0206868400010296800080be000083be0206868c040083be050081be060084be070082be0304048001020182010085be980081be03002cd7010300000002027e0103087e040083be0303047e050082be0403027e020300d703040200010220d502020e000103067e0203027eff0082be00200000010300d7020202000303047e800082be030220d502040e000303047e004856dc01007c01ff0082beff00000000001bd50005000004002cd7000300000002007e00030a7ef70389bf0103007e0403067e0203027e0503047e000000d700070200020020d5010502000203027ef8006ddc000021005f00a0bfc122a2be040051dc000021012200febef70389bf020060d701210100030060d701230100080060d701330100090060d701350100e40051dc00002100980080bef70389bf0100fed6000100020103087e800086be0602007e00030a7e0503007e0203027e0002067e0303047ea00087be02003cd7070202000303027e00001cd5000302000403027e01001cd5010502000003047e080080beff0081be001800000001849608078885080085be0501059604050481080085be0407848400010096060081be0004848c020080be030081be040083be050082be0003008001020282020081beff0184be00280000000082be010080be040083be050081be0203028000010082000083be020081be0103007e030080be0203027e000100d701000200020020d5000206000203027ef0006ddc0000210040ffa0bfc122a2be380051dc000021002200febef70389bf010060d7002d01007e017e8c000060d7002b0100000061d700320100c122a2be380069dc000021002200febe1a01a0bfc122a2be380051dc000021002200febef70389bf000060d7003101007e007e8c240055dc000021032c0055dc00002105f80055dc00002107880182bef
70389bf0703027e020081be0803047e030080be010100d701030000090020d5020104000903047e10016ddc00012100084056dc07007c01f70389bf08016ddc00012100900182be0703127e020081be0803147e030080be090100d7090300000b0020d50a0104000b03147e00016ddc00092100104052dc07007c07800080bef70389bf00004ad407010000010059d4010b020002005ed4010702000102018c0001008b7e0081be0100008b0001018d000061d701340100c122a2be380069dc000021002200febe0000febe0100a5bf6100a0bfc122a2be380051dc000021002200febef70389bf000060d700350100002280be010060d700370100000061d701380100000061d7013a01007e00008b000061d7003c0100c122a2be380069dc000021002200febe7e007e8dad00a5bfc122a2be040051dc000021002200febef70389bf0f0060d7000501000e0060d7000701000d0060d7000901000c0060d7000b01000a0060d7000d01000b0060d7000f0100080060d700110100090060d700130100060060d700010100070060d700030100040060d700150100050060d700170100100051dc0000211f004780be00ff0080d81f000001ff01820000000000499ebeec0051dc00002104080051dc00002101e40051dc00002102c122a2be380051dc000021032200febef70389bf000060d7033901000003067ec122a2be380051dc000021002200febe820081be030018d501060200040018d501080200010055d601071204000034d801020000c10081be007e008cf70389bf000061d7003a0100c122a2be380069dc000021002200febe6300a0bfc122a2be380051dc000021002200febe080155dc00002101800180bef70389bf01005dd4010100007e0080be000061d7003e0100c122a2be380069dc000021002200febe0001008b0000febe3f00a5bfc122a2be040051dc000021002200febef70389bf0f0060d7000501000e0060d7000701000d0060d7000901000c0060d7000b01000a0060d7000d01000b0060d7000f0100080060d700110100090060d700130100060060d700010100070060d700030100040060d700150100050060d700170100100051dc0000211f080155dc00002102a00080bef70389bf00003dd7000402000003027e0203007e004780be00ff008068b7ffff01ff0182ffffffff8002067e180169dc000321000303047e00499ebe100155dc00002103180151dc00002102000155dc0000210080020a7e80020c7ef70b89bf00006edc03057c00f70389bf00006adc00027c00c122a2be380051dc000021002200febef70389bf000060d7003f01007e007e8c800080be000061d700360100c122a2be380069dc000021002200febe3cffa0bfc122a2be380051dc000021002200febef70389bf010060d7003d01007e017e8c000060d7003b0100007e008b000061d7002a0100c122a2be380069dc000021002200febed5fea0bfc122a2be040051dc000021012200febef70389bf010060d701190100c122a2be340051dc000021002200febec10080be010001d580020500810081be01004dd4010300007e016a8bf70389bf000061d700000100c122a2be340069dc000021002200febe1e00a4bfc122a2be380051dc000021012200febef70389bf000060d701330100c122a2be340051dc000021002200febe010001d580020100800080be02004dd401010000800080be000083be021982be000083be020081bef70389bf000061d701020100000061d700000100c122a2be340069dc000021002200febec122a2be340051dc000021002200febef70389bf010060d700010100000060d700030100000001d580020500810081be01004dd4000300007e016a8b0002007e1c0169dc000021003000a4bfc122a2be040051dc000021012200febec122a2be380051dc000021002200febef70789bf0f0060d7010501000e0060d7010701000d0060d7010901000c0060d7010b01000a0060d7010d01000b0060d7010f0100080060d701110100090060d701130100060060d701010100070060d701030100040060d701150100050060d701170100f70389bf000060d700330100100051dc0000211f000001d580020100004780be00ff0080041d000001ff01820000000000499ebe1c0169dc00002100c122a2be040051dc000021052200febef70389bf020060d7052701000f0060d7050501000e0060d7050701000d0060d7050901000c0060d7050b01000a0060d7050d01000b0060d7050f0100080060d705110100090060d705130100060060d705010100070060d705030100040060d705150100050060d705170100ec0051dc000021041c0151dc00002102200051dc00002101100051dc0000211f080051dc00002100820080bef70389bf000046d604010004004780be00ff00805c1c000001ff018200000000280169dc0002210000499ebe280151dc0000
2102e80051dc00002101c122a2be380051dc000021032200febec122a2be340051dc000021002200febe000060d705310100f70789bf010060d703290100020025d502090200010026d50105000000004ed40101000002004ed4020500000002008c7e00008b0001008c000081be030061d701240100c122a2be380069dc000321002200febe0203067ee00069dc000321000103067edc0069dc00032100240169dc00022100200169dc00012100000081bef70389bf000061d701040100c122a2be340069dc000021002200febe7e007e91e5fca6bfc122a2be340051dc000021002200febef70389bf000060d7000501007e007e8c240151dc00002100200151dc00002101f70389bfd80069dc00012100d40069dc000021004200a0bfc122a2be380051dc000021082200febec122a2be040051dc000021072200febef70789bf020060d7082301007e027e8cf70389bf010060d707390100000060d708210100c122a2be340051dc000021002200febec80051dc00002102c40051dc00002103c00051dc00002104bc0051dc00002105b80051dc00002106b40051dc000021017e00008b0001008c000081be070061d701360100c122a2be040069dc000721002200febef70789bf500069dc000621004c0069dc00052100480069dc00042100440069dc00032100400069dc00022100f70389bf2c0169dc00012100000081be000061d701060100c122a2be340069dc000021002200febe7e007e9101f9a6bfd303a0bfc122a2be380051dc000021042200febef70389bf000060d7042701007e007e8cc122a2be340051dc000021002200febe640051dc00002101d80051dc00002103d40051dc00002102f70389bf4c0169dc00022100480169dc00032100800080be01004dd402010000c10080be02020c7e02020a7e0202087e0202067e0202047e000061d700080100440169dc00062100400169dc000521003c0169dc00042100380169dc00032100340169dc00022100300169dc000121007e0080be000061d7000a0100c122a2be340069dc000021002200febe0001008b0000febe3f00a5bfc122a2be040051dc000021042200febef70389bf000060d704270100c122a2be340051dc000021002200febe200051dc00002102cc0051dc000021014c0151dc00002103f70389bf010013d501070200010013d500020200500169dc0001210001004cd401050200800080be000061d7000c01007e0080be000061d7000e0100c122a2be340069dc000021002200febe0001008b0000febe4d00a5bfc122a2be340051dc000021002200febe140051dc00002102180051dc00002101f70389bf0000d8d8010000010000d8d80200000207fc89bf000049d401050200007e008b000061d7000c0100c122a2be340069dc000021002200febe3500a0bfc122a2be340051dc000021072200febef70389bf010060d7070b01007e017e8c000060d707090100c122a2be380051dc000021062200febe440151dc00002105400151dc000021043c0151dc00002103380151dc00002102340151dc00002101300151dc00002100007e0092f71b89bf060061d700200100c122a2be380069dc000621002200febef71789bfc80069dc00052100f71389bfc40069dc00042100f70f89bfc00069dc00032100f70b89bfbc0069dc00022100f70789bfb80069dc00012100f70389bfb40069dc0000210012ffa0bfc122a2be340051dc000021002200febef70389bf000060d7000f01007e007e8c010060d7000d0100000061d7011001007e0080be000061d700120100c122a2be340069dc000021002200febe0001008b0000febee000a5bfc122a2be340051dc000021002200febe180051dc00002101f70389bf0000d8d80100000107fc89bf5c0169dc00012100ff0080be00010000000049d4010100000202027e0302047e54016ddc000121007e0081be0100008b0001018d000061d701140100c122a2be340069dc000021002200febe0000febe0100a5bf7200a0bfc122a2be340051dc000021002200febef70389bf000060d700150100002280be540155dc00002101f70389bf60016ddc000121007e00008b000061d700160100c122a2be340069dc000021002200febe7e007e8db300a5bfc122a2be040051dc000021012200febef70389bf040060d701210100050060d701230100080060d701330100090060d7013501005c0151dc00002100ff0080be00fffffff70389bf010025d500010000880080be010019d500020200080080beff0081be0018000000018296a00086be08068885080083be0301039602030281070083be0206868400010296800080be000083be0206868c040083be050081be060084be070082be0304048001020182010085be980081be03002cd7010300000002027e0103087e040083be0303047e050082be0403027e020300d703040200010220d502020e000103067
e0203027eff0082be00200000010300d7020202000303047e800082be030220d502040e000303047e004856dc01007c01ff0082beff00000000001bd50005000004002cd7000300000002007e00030a7ef70389bf0103007e0403067e0203027e0503047e000000d700070200020020d5010502000203027e60016ddc000021005700a0bfc122a2be040051dc000021012200febef70389bf020060d701210100030060d701230100080060d701330100090060d7013501005c0151dc00002100980080bef70389bf0100fed6000100020103087e800086be0602007e00030a7e0503007e0203027e0002067e0303047ea00087be02003cd7070202000303027e00001cd5000302000403027e01001cd5010502000003047e080080beff0081be001800000001849608078885080085be0501059604050481080085be0407848400010096060081be0004848c020080be030081be040083be050082be0003008001020282020081beff0184be00280000000082be010080be040083be050081be0203028000010082000083be020081be0103007e030080be0203027e000100d701000200020020d5000206000203027e54016ddc0000210040ffa0bfc122a2be340051dc000021002200febef70389bf000060d7001301007e007e8c0b01a0bfc122a2be340051dc000021002200febef70389bf000060d7001701007e007e8c140051dc00002101600155dc00002102f70389bf74016ddc000221000000d8d80100000107fc89bf700169dc00012100ff0080be00010000000049d4010100000202027e0302047e68016ddc000121007e0081be0100008b0001018d000061d701180100c122a2be340069dc000021002200febe0000febe0100a5bf7200a0bfc122a2be340051dc000021002200febef70389bf000060d700190100002280be680155dc00002101f70389bf7c016ddc000121007e00008b000061d7001a0100c122a2be340069dc000021002200febe7e007e8daa00a5bfc122a2be040051dc000021012200febef70389bf040060d701210100050060d701230100080060d701330100090060d701350100700151dc00002100ff0080be00fffffff70389bf010025d500010000880080be010019d500020200080080beff0081be0018000000018296a00086be08068885080083be0301039602030281070083be0206868400010296800080be000083be0206868c040083be050081be060084be070082be0304048001020182010085be980081be03002cd7010300000002027e0103087e040083be0303047e050082be0403027e020300d703040200010220d502020e000103067e0203027eff0082be00200000010300d7020202000303047e800082be030220d502040e000303047e004856dc01007c01ff0082beff00000000001bd50005000004002cd7000300000002007e00030a7ef70389bf0103007e0403067e0203027e0503047e000000d700070200020020d5010502000203027e7c016ddc000021004e00a0bfc122a2be040051dc000021012200febef70389bf020060d701210100030060d701230100080060d701330100090060d701350100700151dc00002100980080bef70389bf0100fed6000100020103087e800086be0602007e00030a7e0503007e0203027e0002067e0303047ea00087be02003cd7070202000303027e00001cd5000302000403027e01001cd5010502000003047e080080beff0081be001800000001849608078885080085be0501059604050481080085be0407848400010096060081be0004848c020080be030081be040083be050082be0003008001020282020081beff0184be00280000000082be010080be040083be050081be0203028000010082000083be020081be0103007e030080be0203027e000100d701000200020020d5000206000203027e68016ddc0000210040ffa0bfc122a2be340051dc000021042200febef70389bf000060d7041b01007e007e8c740155dc000021025c0151dc000021067c0155dc00002100f70389bf084056dc00007c04f70389bf04006adc04067c0008006edc02047c00104052dc00007c04f70389bf10006adc02047c008002047e8002067e08006edc00027c008002047e10006adc00027c00ecfea0bfc122a2be040051dc000021012200febef70389bf010060d701190100c122a2be340051dc000021002200febec10080be010001d580020500810081be01004dd4010300007e016a8bf70389bf000061d7001c0100c122a2be340069dc000021002200febe1900a4bfc122a2be340051dc000021002200febef70389bf000060d700110100010001d580020100800080be02004dd401010000800080be000083be021982be000083be020081be000061d7011e0100000061d7001c0100c122a2be340069dc000021002200febec122a2be340051dc000021002200febef70389bf010060d700
1d0100000060d7001f0100000001d580020500810081be01004dd4000300007e016a8b0002007e840169dc000021003000a4bfc122a2be040051dc000021012200febec122a2be340051dc000021002200febef70789bf0f0060d7010501000e0060d7010701000d0060d7010901000c0060d7010b01000a0060d7010d01000b0060d7010f0100080060d701110100090060d701130100060060d701010100070060d701030100040060d701150100050060d701170100f70389bf000060d700110100100051dc0000211f000001d580020100004780be00ff0080540d000001ff01820000000000499ebe840169dc00002100c122a2be340051dc000021002200febe640051dc00002101840151dc00002102800080bef70389bf8c0169dc0002210001004dd402010000880169dc000121007e0080be000061d700200100c122a2be340069dc000021002200febe0001008b0000febe0d00a5bf8c0151dc000021001c0051dc00002101820080bef70389bf000046d6000104040000d8d80000000007fc89bf880169dc00002100c122a2be380051dc000021062200febec122a2be340051dc000021002200febef70389bf010060d7002101007e017e8c000060d7061f01004c0151dc00002101cc0051dc000021028c0151dc00002103500151dc00002104880151dc00002105f70389bf980169dc0005210001004ad4030902000001018bc10080be0100008d000061d700220100940169dc00022100900169dc000121007e0080be000061d700240100c122a2be340069dc000021002200febe0001008b0000febe2500a5bf140051dc00002102180051dc000021044c0151dc00002100500151dc00002106cc0051dc00002101820080bef70789bf030018d5000c0200050025d504070200030025d502070200f70389bf010026d5010d0200000026d5000d02000000d8d80500000507fc89bf000034d8040500000000d8d80300000307fc89bf000034d802030000940169dc00012100900169dc00002100c122a2be340051dc000021052200febef70389bf010060d7052501007e017e8c000060d705230100980151dc00002100480151dc00002102d00051dc00002104940151dc00002101900151dc00002103007e0092050061d700080100c122a2be340069dc000521002200febef70b89bf440169dc00042100f70389bf400169dc000321003c0169dc000221000003047e380169dc00022100340169dc00012100300169dc00002100a4fca0bfc122a2be340051dc000021002200febef70389bf000060d7000701007e007e8cc122a2be040051dc000021022200febef70389bf000060d702290100c122a2be340051dc000021012200febe2c0151dc00002100f70389bf00001bd500010000800080be010061d700260100c122a2be340069dc000121002200febe9c0169dc00002100c122a2be040051dc000021032200febec122a2be340051dc000021002200febef70789bf000060d703310100f70389bf010060d700270100000061d701280100200051dc000021029c0151dc00002101f70389bfa40169dc00012100010025d501050200a00169dc00012100010049d401010000800080be000061d7002a01007e0080be000061d7002c0100c122a2be340069dc000021002200febe0001008b0000febedb00a5bfc122a2be340051dc000021002200febea00151dc00002101ff0080be00010000f70389bf000049d4010100000202027e0302047ea8016ddc000121007e0081be0100008b0001018d000061d7012e0100c122a2be340069dc000021002200febe0000febe0100a5bf7200a0bfc122a2be340051dc000021002200febef70389bf000060d7002f0100002280bea80155dc00002101f70389bfb0016ddc000121007e00008b000061d700300100c122a2be340069dc000021002200febe7e007e8dbb00a5bfc122a2be040051dc000021012200febef70389bf040060d701210100050060d701230100080060d701330100090060d701350100a00151dc00002100ff0080be00fffffff70389bf010025d500010000880080be010019d500020200080080beff0081be0018000000018296a00086be08068885080083be0301039602030281070083be0206868400010296800080be000083be0206868c040083be050081be060084be070082be0304048001020182010085be980081be03002cd7010300000002027e0103087e040083be0303047e050082be0403027e020300d703040200010220d502020e000103067e0203027eff0082be00200000010300d7020202000303047e800082be030220d502040e000303047e004856dc01007c01ff0082beff00000000001bd50005000004002cd7000300000002007e00030a7ef70389bf0103007e0403067e0203027e0503047e000000d700070200020020d5010502000203027eb0016ddc000021005f00a0bfc122a
2be040051dc000021012200febef70389bf020060d701210100030060d701230100080060d701330100090060d701350100a00151dc00002100980080bef70389bf0100fed6000100020103087e800086be0602007e00030a7e0503007e0203027e0002067e0303047ea00087be02003cd7070202000303027e00001cd5000302000403027e01001cd5010502000003047e080080beff0081be001800000001849608078885080085be0501059604050481080085be0407848400010096060081be0004848c020080be030081be040083be050082be0003008001020282020081beff0184be00280000000082be010080be040083be050081be0203028000010082000083be020081be0103007e030080be0203027e000100d701000200020020d5000206000203027ea8016ddc0000210040ffa0bfc122a2be340051dc000021002200febef70389bf010060d7002d01007e017e8c000060d7002b0100000061d700320100c122a2be340069dc000021002200febe1900a0bfc122a2be340051dc000021002200febef70389bf000060d7003101007e007e8cb00155dc00002101f70389bf084056dc01007c01800180bef70389bf00005dd401010000007e008b000061d7002a0100c122a2be340069dc000021002200febed6ffa0bfc122a2be040051dc000021012200febef70389bf010060d701190100c122a2be340051dc000021002200febec10080be010001d580020500810081be01004dd4010300007e016a8bf70389bf000061d700340100c122a2be340069dc000021002200febe1900a4bfc122a2be340051dc000021002200febef70389bf000060d700330100010001d580020100800080be02004dd401010000800080be000083be021982be000083be020081be000061d701360100000061d700340100c122a2be340069dc000021002200febec122a2be340051dc000021002200febef70389bf010060d700350100000060d700370100000001d580020500810081be01004dd4000300007e016a8b0002007eb80169dc000021003000a4bfc122a2be040051dc000021012200febec122a2be340051dc000021002200febef70789bf0f0060d7010501000e0060d7010701000d0060d7010901000c0060d7010b01000a0060d7010d01000b0060d7010f0100080060d701110100090060d701130100060060d701010100070060d701030100040060d701150100050060d701170100f70389bf000060d700330100100051dc0000211f000001d580020100004780be00ff0080a003000001ff01820000000000499ebeb80169dc00002100c122a2be040051dc000021032200febec122a2be340051dc000021002200febef70389bf010060d700290100000060d703270100a40151dc00002101b80151dc00002102f70389bf010025d50203020000004dd4000402000001008c000081be000061d7012601000103047e9c0169dc00022100bc0169dc00012100000081be000061d701380100c122a2be340069dc000021002200febe7e007e9133fea6bfc122a2be340051dc000021002200febef70389bf000060d7003901007e007e8cc122a2be040051dc000021022200febef70389bf010060d702250100c122a2be340051dc000021002200febebc0151dc00002101f70389bfc00169dc000121007e0080be000061d7003a0100c122a2be340069dc000021002200febe0001008b0000febe1c00a5bfc122a2be040051dc000021002200febef70389bf020060d700210100030060d700230100000060d700330100010060d700350100c00151dc00002101870084be00048484020080be030081be040083be050082be0003008001020282020081be8002007ef70389bf00086adc00010000c122a2be340051dc000021002200febef70389bf000060d7003b01007e007e8cf7f2a0bfc122a2be040051dc000021002200febef70389bf000060d7002d0100810081be00010081900081be000106bf000061d7002a01007e00a2bec100febe040069dc000021002200febe3ff2a2bf6ef2a0bf000089bfc12480be0c0069dc00002000100069dc000120000000febe8002027e004780be00ff00804888ffff01ff0182ffffffff000042dc01000001800080bef70389bf02003ad401010000c10080be7e026a8b0102027e040069dc00012000000061d700000100c12284be000069dc000020000400febe1700a4bfc12284be000051dc000020000400febe7e0180bea00082be000280857e0081be800082be01001fd701040000010020d700020200800080be040069dc00012000f70389bf000061d700000100c12284be000069dc000020000400febec12284be000051dc000020010400febef70389bf000060d701010100040051dc00002000010001d580020100810080be00004dd4010100007e006a8bf70389bf080069dc000020000600a4bf7e0080be800081be00001fd7
00020000080069dc00002000c12284be000051dc000020010400febe080051dc00002000c12480bef70389bf0c0051dc00002000100051dc000020010000febef70389bf1e4880be000089bf01001bd58100020000004ad401030100800080be00004dd400010000001880be0002007e1e4880be000089bfc12480be100069dc000020000000febe0c0069dc00022000080069dc000120000003067ec12283be000051dc000020000300febe040069dc00032000010049d4010502007e0080bef70389bf000061d700000100c12283be000069dc000020000300febe0001008b0000febe1500a5bf0c0051dc00002002080051dc00002001040051dc00002000820080bef70389bf000046d601010004016f027e010047d6010502000000d8d80000000207fc89bf0000ccda0102000107fc89bf000034d800010000c12283be000051dc000020000300febef70389bf000060d7000101007e007e8cc12480be100051dc000020000000febe070089bf1e4880be000089bf0f03207e0e031e7e0d031c7e0c031a7e0b03187e0a03167e0903147e0803127e0703107e06030e7e05030c7e04030a7e0303087e0203067e0103047e0003027e004780be00ff00803897ffff01ff0182ffffffff8402007e004880be000080bf000080bf000080bf000080bf000080bf000080bf000080bf000080bf000080bfff00a0bec0000000010061d70f000100010061d70e020100010061d70d040100010061d704060100010061d705080100010061d7020a0100010061d7030c0100010061d7000e0100010061d701100100c12292be000069dc00017c001200febe0003027ec12292be000051dc00007c001200febe140069dc00017c00010004f4000000f8010104f4080000f8070089bf000061d704120100000061d705140100000061d700160100000061d701180100010000f4100000f807fc89bf000081be000061d7011a0100ff0081beff03000001001bd501030000100069dc00017c00010049d401010000800080be8002067e0303047e08006ddc00017c00ff0080beffff7fffff02027effff7fff040069dc00017c007e0080be000061d7001c0100c12292be000069dc00007c001200febe0001008b0000febe1d00a5bfc12292be000051dc00007c021200febef70389bf020060d702170100030060d702190100080055dc00007c00820080bef70389bf01003cd700000200020081be0103007e030080be0203027e000100d701000200020020d5000206000203027e000052dc00007c00f70389bf040069dc00007c00c12292be000051dc00007c001200febef70389bf010060d7001d01007e017e8c000060d7001b0100080055dc00007c01040051dc00007c039f0081be00010081000101869b0082be0102018500010081850081be00010186000061d7011e0100810080be010008bff70789bf02030a7e0103087e28006ddc00047c00f70389bf0303087e240069dc00047c00200069dc00037c0018006ddc00017c00000061d7002001007e0092bec100febe000069dc00007c001200febe3f01a2bf0a00a0bf300055dc00007c01380051dc00007c00f70789bf28006ddc00017c00f70389bf240069dc00007c00c12292be000051dc00007c011200febef70389bf0e0060d7010101000d0060d7010301000c0060d7010501000a0060d7010701000b0060d701090100040060d7010f0100050060d701110100000060d7010b0100010060d7010d0100140051dc00007c1f240051dc00007c02280055dc00007c03f70389bf5c006ddc00037c00800083bec10082be00001fd702060000480069dc00007c00810082be010061d70222010003001dd500050000a00083be020041d403070000000001d500070a00820082be000018d502000200580069dc00007c000000ccda00020000980188be000086be010080be080087be090081be0607088000010082000089be010061d708240100010061d709260100004780be00ff00809c0d000001ff018200000000010061d700280100010061d7012a0100c12292be000069dc00017c001200febe0203027e00499ebe140051dc00007c1fc12292be000051dc00007c011200febef70389bf0e0060d7010101000d0060d7010301000c0060d7010501000a0060d7010701000b0060d701090100080060d701250100090060d701270100040060d7010f0100050060d701110100000060d701290100010060d7012b01000003027e480051dc00007c00f70389bf03001dd500050000060041d403070000000001d500071a00000018d502000200540069dc00007c000000ccda0001000000499ebe140051dc00007c1fc12292be000051dc00007c011200febef70389bf0e0060d7010101000d0060d7010301000c0060d7010501000a0060d7010701000b0060d701090100080060d701250100090060d701270100040060d7010f0100050060d7011
10100000060d701290100010060d7012b01000003027e480051dc00007c00840086bef70389bf03001dd5000d0000060041d403070000000001d500071a00000018d502000200500069dc00007c000000ccda0001000000499ebe140051dc00007c1fc12292be000051dc00007c011200febef70389bf0e0060d7010101000d0060d7010301000c0060d7010501000a0060d7010701000b0060d701090100080060d701250100090060d701270100040060d7010f0100050060d701110100000060d701290100010060d7012b01000003027e480051dc00007c00880086bef70389bf03001dd5000d0000060041d403070000000001d500071a00000018d5020002004c0069dc00007c000000ccda0001000000499ebe140051dc00007c1fc12292be000051dc00007c011200febef70389bf0e0060d7010101000d0060d7010301000c0060d7010501000a0060d7010701000b0060d701090100080060d701250100090060d701270100040060d7010f0100050060d701110100000060d701290100010060d7012b01000003027e480051dc00007c00900086bef70389bf03001dd5000d0000030041d403070000000001d500070e00000018d502000200440069dc00007c000000ccda0001000000499ebec12292be000051dc00007c011200febef70389bf050060d7012301000003027ec12292be000051dc00007c001200febe000012d401050200f70389bf000061d7002c0100000081be000061d7012e0100001884bec10080be800081be040507bf0202027e0302047e000061d7013001003c006ddc00017c00000061d7003201007e0092bec100febe000069dc00007c001200febe1701a2bff101a0bfc12292be000051dc00007c001200febef70389bf000060d7001b0100010060d700210100100051dc00007c01200051dc00007c02180055dc00007c03f70389bf70006ddc00037c006c0069dc00027c00850082be000061d701340100010046d601040404680069dc00017c00010049d401010000ff0080beffff7fffff02027effff7fff640069dc00017c007e0080be000061d700360100c12292be000069dc00007c001200febe0001008b0000febe2000a5bfc12292be000051dc00007c011200febef70389bf020060d701170100030060d701190100680051dc00007c00800080be8002047e0203027e820080bef70389bf01003cd700000200020081be0103007e030080be0203027e000100d701000200020020d5000206000203027e000052dc00007c00f70389bf640069dc00007c00c12292be000051dc00007c001200febef70389bf000060d7003701007e007e8c6c0051dc00007c01700055dc00007c02640051dc00007c04f70389bf840069dc00047c0001001dd4040302007c006ddc00027c00780069dc00017c007e0080be000061d700380100c12292be000069dc00007c001200febe0001008b0000febe4a00a5bfc12292be000051dc00007c001200febef70389bf0e0060d7000101000d0060d7000301000c0060d7000501000a0060d7000701000b0060d700090100040060d7000f0100050060d700110100000060d7000b0100010060d7000d0100700055dc00007c04840051dc00007c036c0051dc00007c01140051dc00007c1f680051dc00007c06800082be8002007e00030e7ef70389bf88006ddc00067c00980186be000082be010080be060083be070081be0203088000010082000089be004780be00ff00806407000001ff0182000000000303007e00499ebe880055dc00007c0100030c7e0603007e000012d4030d0200f70389bf02030c7e0503067e030001d5030d02000103047e0403027e010001d5010502000303047e7c006ddc00017c00780069dc00007c00c12292be000051dc00007c001200febef70389bf020060d7003901007e027e8c010060d7001f0100000060d7003501007c0055dc00007c01780051dc00007c03810082be00020081000106bff70789bf02030a7e0103087e30006ddc00047c00f70389bf0303087e380069dc00047c00200069dc00037c0018006ddc00017c00000061d7002001007e0092bec100febe000069dc00007c001200febee2fda2bf1fffa0bfc12292be000051dc00007c001200febef70389bf000060d7003b01007e007e8c010060d7003d0100900055dc00007c01800080be000061d701300100f70389bf3c006ddc00017c00000061d700320100c12292be000069dc00007c001200febef700a0bfc12292be000051dc00007c031200febef70389bf010060d7032d0100000060d7033f01005c0055dc00007c00c12292bea00051dc00007c021200febe007e0091017e018b0001008cf70389bf020061d700000100c12292bea00069dc00027c001200febe98006ddc00007c002f01a0bfc12292be000051dc00007c001200febef70389bf010060d7002f0100100051dc00007c01440051dc00007c
024c0051dc00007c05500051dc00007c06540051dc00007c07580051dc00007c08c12292bea00051dc00007c091200febe5c0055dc00007c0aff0080beffffff7fc10082be000083be030080bef70389bf0b03067e010084be030001d500061200020080be0a03087e040001d5000806000403147e0303167ea00080be090061d700020100c12292bea00069dc00097c001200febe0b031a7e0a03187e0c003dd7001802000c03127e0000ccda0804000d0000ccda080900080102187e0c03127e07fc89bf08003cd700100200800081be0102187e0c031c7e0903187e0e031e7e0c001cd50c1f02000d03127e08001cd5081302000c03127e020051d4081502000903147e030001d503150a00040001d504110a000403127e0303147e0a03187e0903167e0b003dd7001602000b03107e0000ccda0704000c0000ccda070800070202167e0b03107e07fc89bf07003cd7000e02000102167e0b031a7e0803167e0d031c7e0b001cd50b1d02000c03107e07001cd5071102000b03107e020051d4071302000803127e030001d503130a00040001d5040f0a000403107e0303127e0903167e0803147e0a003dd7001402000a030e7e0000ccda0604000b0000ccda060700060202147e0a030e7e07fc89bf06003cd7000c02000102147e0a03187e0703147e0c031a7e0a001cd50a1b02000b030e7e06001cd5060f02000a030e7e020051d4061102000703107e030001d503110a00040001d5040d0a0004030e7e0303107e0803147e0703127e09003dd70012020009030c7e0000ccda0504000a0000ccda050600050202127e09030c7e07fc89bf05003cd7000a02000102127e0903167e0603127e0b03187e09001cd5091902000a030c7e05001cd5050d020009030c7e020051d4050f020006030e7e030001d5030f0a00040001d5040b0a0004030a7e03030c7e0603107e05030e7eac006ddc00077c0005003dd7000a02000503067e0000ccda0204000407fc89bfa80069dc00047c000000ccda0203000207fc89bfa40069dc00027c00800080be00004ad4010100000202027e0302047e000061d7013c010090006ddc00017c007e0081be0100008b0001018d000061d7013a0100c12292be000069dc00007c001200febe0000febef1fea5bf2200a0bfc12292be000051dc00007c031200febef70389bf010060d703330100000060d703310100c12292bea00051dc00007c021200febe3c0055dc00007c00030061d7003e0100c12292be000069dc00037c001200febe7e016a8bf70789bf020061d700000100c12292bea00069dc00027c001200febef70389bf98006ddc00007c00e8fea4bf3200a0bfc12292be000051dc00007c021200febeac0055dc00007c04a80051dc00007c00a40051dc00007c060002027e01030e7ea00080bef70389bf06003cd7000c0200800080be8002067e0303027e0103107e0703067e03001cd5031102000003027e0603007e00001cd5000302000303027e000051d40009020001030c7e0503067e030001d5030d02000003027e0403007e000001d5000302000303027ec10080be7e0080be020061d7003c0100c12292be000069dc00027c001200febe90006ddc00007c009cfea0bfc12292bea00051dc00007c001200febef70389bf010060d700010100980055dc00007c01f70389bfb4006ddc00017c007e0080be000061d700040100c12292bea00069dc00007c001200febe0001008b0000febe0f00a5bfc12292be000051dc00007c001200febef70389bf000060d700130100010060d700150100b40055dc00007c018002007ef70389bf00006edc00010000c12292bea00051dc00007c001200febef70389bf000060d7000501007e007e8cc12292be000051dc00007c011200febe0000b0bf000089bf210080be2000a1be010010d501030200000010d500010200000010d5000302000000a1be1e4880be00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf0
0009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf00009fbf0600000000000000f0080000000000000b0000000000000018000000000000000500000000000000040b0000000000000a00000000000000fc00000000000000f5feff6f00000000280a0000000000000400000000000000940a00000000000000000000000000000000000000000000414d4420636c616e672076657273696f6e2031372e302e30202868747470733a2f2f6769746875622e636f6d2f526164656f6e4f70656e436f6d707574652f6c6c766d2d70726f6a65637420726f632d362e302e3020323334383320373230386538643135666266323138646562373434383365613863353439633637636134393835652900636c616e672076657273696f6e2031392e302e30676974004c696e6b65723a204c4c442031392e302e300000636c616e672076657273696f6e2031382e302e3000000000000000000000000000000000000000000000000000000100000001000600480c00000000000004000000000000001400000001000600440c00000000000001000000000000002b00000001000a0070ba00000000000090a8010000000000370000000200070050850000000000002c0000000000000043000000020007007c8500000000000000010000000000004f0000000200070044980000000000002c000000000000003e01000001020600400c0000000000000400000000000000720100000002080000aa00000000000000000000000000005f00000022030700001d000000000000fc000000000000007500000022030700fc1d000000000000c0000000000000008d00000022030700bc1e0000000000004815000000000000a6000000220307000434000000000000b007000000000000bc00000022030700b43b000000000000a809000000000000ce000000220307005c45000000000000c403000000000000e2000000220307002049000000000000b83a000000000000f100000022030700d883000000000000780100000000000007010000220307007c8600000000000060000000000000001f01000012030700008700000000000044110000000000002d01000011030600000c00000000000040000000000000005601000011000a0000630200000000000100000000000000002e6e6f7465002e64796e73796d002e676e752e68617368002e68617368002e64796e737472002e726f64617461002e74657874002e64796e616d6963002e72656c726f5f70616464696e67002e627373002e636f6d6d656e74002e73796d746162002e7368737472746162002e73747274616200005f5f6f636c635f4142495f76657273696f6e005f5f6f636c635f7761766566726f6e7473697a653634005f5f756e6e616d65645f31005f5f756e6e616d65645f32005f5f756e6e616d65645f33005f5f6f636d6c5f666d61785f663332005f5f6f636b6c5f6465766d656d5f72657175657374005f5f6f636b6c5f686f737463616c6c5f70726576696577005f5f6f636b6c5f686f737463616c6c5f696e7465726e616c005f5f6f636b6c5f6873615f7369676e616c5f616464005f5f6f636b6c5f646d5f696e69745f7631005f5f6f636b6c5f6765745f6c6f63616c5f6964005f5f6f636b6c5f646d5f7472696d005f5f6f636b6c5f6163746976656c616e655f753332005f5f6f636b6c5f73616e6974697a65725f7265706f7274006172676d61785f463332493634006172676d61785f4633324936342e6b64006c6c766d2e616d6467636e2e6162692e76657273696f6e005f5f6869705f637569645f36613435623435373236356465323731005f44594e414d494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000007000000020000000000000038020000000000003802000000000000b806000000000000000000000000000004000000000000000000000000000000070000000b0000000200000000000000f008000000000000f00800000000000038010000000000000500000001000000080000000000000018000000000000000f000000f6ffff6f0200000000000000280a000000000000280a0000000000006c0000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000940a000000000000940a00000000000070000000000000000200000000000000040000000000000004000000000000001f000000030000000200000000000000040b000000000000040b000000000000fc0000000000000000000000000000000100000000000000000000000000000027000000010000000200000000000000000c000000000000000c0000000000004c00000000000000000000000000
0000400000000000000000000000000000002f000000010000000600000000000000001d000000000000000d000000000000007d0000000000000000000000000000000100000000000000000000000000003500000006000000030000000000000000aa000000000000008a00000000000070000000000000000500000000000000080000000000000010000000000000003e00000008000000030000000000000070aa000000000000708a00000000000090050000000000000000000000000000010000000000000000000000000000004d00000008000000030000000000000070ba000000000000708a00000000000091a8010000000000000000000000000008000000000000000000000000000000520000000100000030000000000000000000000000000000708a000000000000c7000000000000000000000000000000010000000000000001000000000000005b0000000200000000000000000000000000000000000000388b000000000000f8010000000000000e0000000900000008000000000000001800000000000000630000000300000000000000000000000000000000000000308d00000000000075000000000000000000000000000000010000000000000000000000000000006d0000000300000000000000000000000000000000000000a58d0000000000007b01000000000000000000000000000001000000000000000000000000000000"> : vector<37600xi8> + }> + ] + }) + attributes {subgroupSize = 32, workgroup_size = [32 : index, 1 : index, 1 : index]} + util.return %4 : tensor<1xi64> + } + // data = dense<"0x7f454c was generated by generate_hsaco.sh under filename.hex. It uses + // xxd -p -c 1000000 filename.hsaco > filename.hex to generate the hexdump. and the shape is + // vector. + + // Custom matcher for argmax operations equivalent to the custom kernel. This + // matcher will be run one-by-one on all operations contained within the + // target function. On success, it will return the handle to the matched + // argmax operation. + transform.named_sequence @match_argmax(%generic: !transform.any_op {transform.readonly}) -> (!transform.any_op) { + // Fail fast on non-linalg generics. + transform.match.operation_name %generic ["linalg.generic"] : !transform.any_op + %matched = transform.match.structured failures(propagate) %generic : (!transform.any_op) -> (!transform.any_op) { + ^bb1(%argmax: !transform.any_op): + // Verify that the rank (i.e. number of loops) of the linalg op is 2, + // with one parallel iterator and one reduction iterator. + // TODO: Add optionality for the parallel dimensions. + %c2 = transform.param.constant 2 : i64 -> !transform.param + %rank = transform.match.structured.rank %argmax : (!transform.any_op) -> !transform.param + transform.match.param.cmpi eq %rank, %c2 : !transform.param + transform.match.structured.dim %argmax[0] {parallel} : !transform.any_op + transform.match.structured.dim %argmax[-1] {reduction} : !transform.any_op + + // Verify a single input (target vector to compute the argmax of) and two + // outputs, one for the maximum value and one for the index. + %c1 = transform.param.constant 1 : i64 -> !transform.param + %n_inputs = transform.match.structured.num_inputs %argmax : (!transform.any_op) -> !transform.param + transform.match.param.cmpi eq %n_inputs, %c1 : !transform.param + %n_outputs = transform.match.structured.num_inits %argmax : (!transform.any_op) -> !transform.param + transform.match.param.cmpi eq %n_outputs, %c2 : !transform.param + + transform.match.structured.yield %argmax : !transform.any_op + } + + // Verify the operand shapes of the linalg op. For example, in the below, + // dim 0 must be statically 1, and dim 1 must be statically divisible by 64. 
+    %in0 = transform.get_operand %matched[0] : (!transform.any_op) -> !transform.any_value
+    transform.iree.match.cast_compatible_type %in0 = tensor<1x?xf32> : !transform.any_value
+    transform.iree.match.dim_is_multiple_of %in0[1], 64 : !transform.any_value
+    %out0 = transform.get_operand %matched[1] : (!transform.any_op) -> !transform.any_value
+    transform.iree.match.cast_compatible_type %out0 = tensor<1xf32> : !transform.any_value
+    %out1 = transform.get_operand %matched[2] : (!transform.any_op) -> !transform.any_value
+    transform.iree.match.cast_compatible_type %out1 = tensor<1xi64> : !transform.any_value
+
+    // Verify the region of the argmax op. This does a structural comparison of
+    // the region(s) of the payload operation against the single operation
+    // contained within the body of this operation. It does not verify other
+    // input types/attributes, because for kernel matching the most important
+    // part to get exactly right is typically the inner loop; small variations
+    // in shape information, iterator counts, and the like are better handled
+    // by more general matchers.
+    transform.iree.match.regions %matched : !transform.any_op {
+      ^bb0(%target: tensor<1x?xf32>, %empty_max: tensor<1xf32>, %empty_idx: tensor<1xi64>):
+        %5:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                                affine_map<(d0, d1) -> (d0)>,
+                                                affine_map<(d0, d1) -> (d0)>],
+                               iterator_types = ["parallel", "reduction"]}
+                               ins(%target : tensor<1x?xf32>)
+                               outs(%empty_max, %empty_idx : tensor<1xf32>, tensor<1xi64>) {
+        ^bb0(%in: f32, %out: f32, %out_0: i64):
+          %6 = linalg.index 1 : index
+          %7 = arith.index_cast %6 : index to i64
+          %8 = arith.maximumf %in, %out : f32
+          %9 = arith.cmpf ogt, %in, %out : f32
+          %10 = arith.select %9, %7, %out_0 : i64
+          linalg.yield %8, %10 : f32, i64
+        } -> (tensor<1xf32>, tensor<1xi64>)
+    }
+    transform.yield %generic : !transform.any_op
+  }
+
+  // Rewrite callback for `transform.foreach_match`. The input signature of this
+  // sequence must exactly match the outputs of the matcher. In this case we just
+  // take the argmax as an input, import the entry point for the custom kernel
+  // authored above, and replace the users of the argmax with a call to the
+  // function.
+  transform.named_sequence @cast_and_call_argmax(%argmax: !transform.any_op {transform.readonly}) {
+    %module = transform.util.get_nearest_symbol_table %argmax : (!transform.any_op) -> !transform.any_op
+    %func = transform.util.import_symbol @argmax_1d_f32_entry_point into %module if undefined : (!transform.any_op) -> !transform.any_op
+    %ins = transform.get_operand %argmax[0] : (!transform.any_op) -> !transform.any_value
+    %outs = transform.get_result %argmax[1] : (!transform.any_op) -> !transform.any_value
+    transform.util.cast_and_call %func(%ins) -> %outs before %argmax {
+      // This specifies how to resolve type mismatches between the arguments
+      // of the function and the inputs to the argmax. In this example, the
+      // only casts this will generate are same-rank tensor casts that drop
+      // static information.
+      transform.type_conversion.tensor.cast_shape_dynamic_dims
+    } : (!transform.any_op, !transform.any_value, !transform.any_value, !transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+
+  // Entry point for the transform interpreter, nested on the full module. This
+  // is because the rewrites needed for importing the custom kernel need to
+  // add a new symbol to the module's symbol table.
+ transform.named_sequence @__transform_main(%module: !transform.any_op) { + // Gather the set of functions within the module. + %funcs = transform.structured.match ops{["util.func"]} in %module : (!transform.any_op) -> !transform.any_op + // For each function in the module, run the matcher on all contained + // operations. + transform.foreach %funcs : !transform.any_op { + ^bb1(%func: !transform.any_op): + transform.foreach_match in %func + // -> + // Multiple matcher-action pairs can be specified comma separated, + // here we are only doing a single kind of match and replace. + // + // Note that the operations within the module are walked in + // post-order, meaning actions must be very careful in their + // replacements not to modify successors of operations. Nested + // regions and DAG roots will be visited last so it is safest to + // do matching + replacement on the root of the DAG rather than + // trying to look ahead. The other option is to avoid dce/cse until + // after the walk is complete. + @match_argmax -> @cast_and_call_argmax + : (!transform.any_op) -> (!transform.any_op) + } + // Cleanup now dead instances of argmax. + transform.apply_dce to %module : !transform.any_op + transform.yield + } +} \ No newline at end of file diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index f9dfcc2dc..81e92abc8 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -473,7 +473,19 @@ def evict_kvcache_space(self): ) ukernel_supported_arch = {"gfx90a", "gfx940", "gfx1030", "gfx1100"} if target_triple in ukernel_supported_arch: - flags.extend(["--iree-rocm-enable-ukernels=argmax"]) + flags.extend( + [ + "--iree-rocm-enable-ukernels=argmax", + "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", + ] + ) + if os.path.exists("llama_argmax_td_spec.mlir"): + flags.extend( + [ + "--iree-preprocessing-transform-spec-filename=llama_argmax_td_spec.mlir", + ] + ) elif device == "cuda": flags.extend( [ @@ -500,7 +512,6 @@ def evict_kvcache_space(self): return blob_name return module_str, tokenizer - if __name__ == "__main__": args = parser.parse_args() mod_str, _ = export_transformer_model( From b44cd162a880b7eb432e79e6c7880e52649dc80d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 21:56:11 -0500 Subject: [PATCH 036/174] Remove --verify=false from VAE compilation flags and move pipeline IRs --- .../custom_models/sd_inference/utils.py | 2 +- .../sdxl_inference/pipeline_ir.py | 95 ++++++++++++++++ .../sdxl_inference/sdxl_compiled_pipeline.py | 102 ++---------------- .../sdxl_inference/sdxl_scheduled_unet.py | 8 +- 4 files changed, 106 insertions(+), 101 deletions(-) create mode 100644 models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 5690e5f4a..8fd8fcc63 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -22,7 +22,7 @@ "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", ], "clip": [], - "vae": ["--verify=false"], + "vae": [], } diff --git 
a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py new file mode 100644 index 000000000..72bf554a9 --- /dev/null +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -0,0 +1,95 @@ +sdxl_pipeline_bench_f16 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, 
\22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> + return %image : tensor<1x3x1024x1024xf16> + } +} +""" + +sdxl_pipeline_bench_f32 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", 
torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @tokens_to_image(%sample: tensor<1x4x128x128xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf32> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf32>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> + scf.yield %inner : tensor<1x4x128x128xf32> + } + %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> + return %image : tensor<1x3x1024x1024xf32> + } +} +""" + +sdxl_sched_unet_bench_f16 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, 
\22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + return %res : tensor<1x4x128x128xf16> + } +} +""" + +sdxl_sched_unet_bench_f32 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, 
\22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @produce_image_latents(%sample: tensor<1x4x128x128xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: tensor<1xf32>) -> tensor<1x4x128x128xf32> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x128x128xf32>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> + scf.yield %inner : tensor<1x4x128x128xf32> + } + return %res : tensor<1x4x128x128xf32> + } +} +""" \ No newline at end of file diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index d0be2c004..ef25ee0ec 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -13,6 +13,12 @@ ) import iree.runtime as ireert from turbine_models.custom_models.sd_inference import utils +from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( + sdxl_sched_unet_bench_f32, + sdxl_sched_unet_bench_f16, + sdxl_pipeline_bench_f32, + sdxl_pipeline_bench_f16, +) from turbine_models.utils.sdxl_benchmark import run_benchmark from turbine_models.model_runner import vmfbRunner from transformers import CLIPTokenizer @@ -54,102 +60,6 @@ "pipeline": None, } -sdxl_pipeline_bench_f16 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, 
\22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) - %noisy_sample, %time_ids, %steps = func.call 
@compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> - } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> - return %image : tensor<1x3x1024x1024xf16> - } -} -""" - -sdxl_pipeline_bench_f32 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: 
null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @tokens_to_image(%sample: tensor<1x4x128x128xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf32> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf32>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> - scf.yield %inner : tensor<1x4x128x128xf32> - } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> - return %image : tensor<1x3x1024x1024xf32> - } -} -""" - -sdxl_sched_unet_bench_f16 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> 
tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> - } - return %res : tensor<1x4x128x128xf16> - } -} -""" - -sdxl_sched_unet_bench_f32 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, 
{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: tensor<1xf32>) -> tensor<1x4x128x128xf32> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x128x128xf32>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> - scf.yield %inner : tensor<1x4x128x128xf32> - } - return %res : tensor<1x4x128x128xf32> - } -} -""" - class SharkSDXLPipeline: def __init__( diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 210a0844c..36f92f8ae 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -254,11 +254,11 @@ def run_forward( def export_pipeline_module(args): - from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import ( - sdxl_pipeline_bench_f16, - sdxl_pipeline_bench_f32, - sdxl_sched_unet_bench_f16, + from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( sdxl_sched_unet_bench_f32, + sdxl_sched_unet_bench_f16, + sdxl_pipeline_bench_f32, + sdxl_pipeline_bench_f16, ) pipeline_file = ( From f23f56db37290f37ceac2ef31a5ad4ea877a30e2 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 21:57:22 -0500 Subject: [PATCH 037/174] Remove unused import and revert change to sdxl unet correctness tolerance. 
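A small, self-contained illustration of the tolerance semantics this revert restores; the array values are made up, and only the rtol/atol values and the argument pattern mirror the test diff below.

    # For np.testing.assert_allclose(actual, desired, rtol, atol) the elementwise
    # criterion is |actual - desired| <= atol + rtol * |desired|, so the test's
    # assert_allclose(torch_output, turbine, rtol, atol) with rtol = atol = 4e-2
    # allows an absolute slack of 4e-2 plus 4% of the reference magnitude.
    import numpy as np

    a = np.array([1.00, 2.00])  # stand-in for torch_output
    b = np.array([1.03, 2.05])  # stand-in for turbine
    np.testing.assert_allclose(a, b, rtol=4e-2, atol=4e-2)  # passes
    # With the looser atol = 4e-1 that this patch reverts, much larger absolute
    # mismatches would also have passed.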
--- models/turbine_models/custom_models/stateless_llama.py | 1 - models/turbine_models/tests/sdxl_test.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 81e92abc8..18b833c6e 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -2,7 +2,6 @@ import sys import re import json -import copy from turbine_models.turbine_tank import turbine_tank os.environ["TORCH_LOGS"] = "dynamic" diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index aab83657c..8506ff3b8 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -319,7 +319,7 @@ def test02_ExportUnetModel(self): tracy_profile=arguments["tracy_profile"], ) rtol = 4e-2 - atol = 4e-1 + atol = 4e-2 np.testing.assert_allclose(torch_output, turbine, rtol, atol) From 65773458026b77103182d25ce3da9af90b5b13b2 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 24 Apr 2024 21:57:58 -0500 Subject: [PATCH 038/174] Formatting --- .../turbine_models/custom_models/sdxl_inference/pipeline_ir.py | 2 +- models/turbine_models/custom_models/stateless_llama.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index 72bf554a9..0348666d3 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -92,4 +92,4 @@ return %res : tensor<1x4x128x128xf32> } } -""" \ No newline at end of file +""" diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 18b833c6e..c3f8a9050 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -511,6 +511,7 @@ def evict_kvcache_space(self): return blob_name return module_str, tokenizer + if __name__ == "__main__": args = parser.parse_args() mod_str, _ = export_transformer_model( From 393bfe34816c18b42f8b9404d0395d8df50e73cd Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Thu, 25 Apr 2024 02:24:29 -0500 Subject: [PATCH 039/174] Relax unet numerics tolerance in sdxl tests. --- models/turbine_models/tests/sdxl_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index 8506ff3b8..aab83657c 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -319,7 +319,7 @@ def test02_ExportUnetModel(self): tracy_profile=arguments["tracy_profile"], ) rtol = 4e-2 - atol = 4e-2 + atol = 4e-1 np.testing.assert_allclose(torch_output, turbine, rtol, atol) From 1b0bff620c522d6c4041c66c52cec09429fed99f Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 1 May 2024 11:15:08 -0500 Subject: [PATCH 040/174] Update sdxl compile flags. 
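A minimal sketch of how flag sets like the ones updated below might be combined per submodel before compilation; the helper name is hypothetical, and utils.compile_to_vmfb is presumably what actually folds these lists into the iree-compile invocation.

    # Hypothetical helper, for illustration only: the shared "all" flags apply to
    # every SD submodel and the per-submodel entries are appended on top.
    def gather_amdgpu_flags(amdgpu_flags: dict, submodel: str) -> list:
        flags = list(amdgpu_flags["all"])        # shared SDXL/ROCm flags
        flags += amdgpu_flags.get(submodel, [])  # e.g. the "unet" or "vae" extras below
        return flags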
--- .../custom_models/sd_inference/utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 8fd8fcc63..d6c00cbef 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -11,18 +11,26 @@ # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. amdgpu_flags = { - "all": [], - "unet": [ + "all": [ "--iree-global-opt-propagate-transposes=true", "--iree-opt-outer-dim-concat=true", "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", "--iree-opt-data-tiling=false", + "--iree-flow-enable-aggressive-fusion", + "--iree-global-opt-enable-fuse-horizontal-contractions=true", + "--iree-opt-aggressively-propagate-transposes=true", "--iree-codegen-gpu-native-math-precision=true", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + + ], + "unet": [ + "--iree-codegen-llvmgpu-use-vector-distribution=true", ], "clip": [], - "vae": [], + "vae": [ + "--iree-codegen-llvmgpu-use-vector-distribution=true", + ], } From 1cb45b0abcc2518a3b094084cd5e551bd2254e1a Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 1 May 2024 12:24:46 -0500 Subject: [PATCH 041/174] Update default_mfma_attn_spec.mlir --- .../custom_models/sd_inference/default_mfma_attn_spec.mlir | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir index 4bbe76a1b..5f9b9ffba 100644 --- a/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir +++ b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir @@ -157,7 +157,7 @@ module attributes { transform.with_named_sequence } { // Step 5. Pre-process the contract and transfer ops to put it in the right form. // =========================================================================== transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.fold_arith_ext_into_contraction + transform.apply_patterns.vector.fold_arith_extension } : !transform.any_op // Step 6. Post-bufferization vector distribution @@ -359,7 +359,7 @@ module attributes { transform.with_named_sequence } { // Step 5. Pre-process the contract and transfer ops to put it in the right form. // =========================================================================== transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.fold_arith_ext_into_contraction + transform.apply_patterns.vector.fold_arith_extension } : !transform.any_op // Step 6. 
Post-bufferization vector distribution @@ -476,4 +476,4 @@ module attributes { transform.with_named_sequence } { : (!transform.any_op) -> (!transform.any_op) transform.yield } -} //// module \ No newline at end of file +} //// module From d59b90910166db1bbc800a232ed49e320021f5ea Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 1 May 2024 12:26:47 -0500 Subject: [PATCH 042/174] Reorganize some compile flags. --- .../turbine_models/custom_models/sd_inference/utils.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index d6c00cbef..58b78af6f 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -17,20 +17,16 @@ "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", "--iree-opt-data-tiling=false", - "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", "--iree-codegen-gpu-native-math-precision=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], - "unet": [ - "--iree-codegen-llvmgpu-use-vector-distribution=true", - ], + "unet": ["--iree-flow-enable-aggressive-fusion"], "clip": [], - "vae": [ - "--iree-codegen-llvmgpu-use-vector-distribution=true", - ], + "vae": ["--iree-flow-enable-aggressive-fusion"], } From 660b563959d5d4128350eb8a5fe2fb4ec5a65370 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 1 May 2024 12:28:31 -0500 Subject: [PATCH 043/174] formatting --- models/turbine_models/custom_models/sd_inference/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 58b78af6f..c7c134885 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -22,7 +22,6 @@ "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - ], "unet": ["--iree-flow-enable-aggressive-fusion"], "clip": [], From c3cbf9371317721eaf4ec7f8e4b2071269fb47d7 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 1 May 2024 13:07:47 -0500 Subject: [PATCH 044/174] Update spec for nod-ai/SHARK-TestSuite@072e8b7f3140b31669257e6042dc1f02f2a4e2cc --- .../sd_inference/default_mfma_attn_spec.mlir | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir index 5f9b9ffba..d5c93011d 100644 --- a/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir +++ b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir @@ -193,9 +193,7 @@ module attributes { transform.with_named_sequence } { transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) 
-> !transform.any_op - transform.iree.amdgpu_distribute_vectors %distribute_func : !transform.any_op - - %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op + %distribute_func_2 = transform.iree.amdgpu_distribute_vectors %distribute_func : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %distribute_func_2 { transform.apply_patterns.canonicalization @@ -397,9 +395,7 @@ module attributes { transform.with_named_sequence } { transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %distribute_func : !transform.any_op - - %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op + %distribute_func_2 = transform.iree.amdgpu_distribute_vectors %distribute_func : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %distribute_func_2 { transform.apply_patterns.canonicalization @@ -476,4 +472,4 @@ module attributes { transform.with_named_sequence } { : (!transform.any_op) -> (!transform.any_op) transform.yield } -} //// module +} //// module \ No newline at end of file From 54c5283c6a1976677a849659872a271a8ec1a190 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 1 May 2024 13:48:36 -0500 Subject: [PATCH 045/174] Update README.md --- .../custom_models/sdxl_inference/README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/README.md b/models/turbine_models/custom_models/sdxl_inference/README.md index 4dc9107a4..f6cea1b21 100644 --- a/models/turbine_models/custom_models/sdxl_inference/README.md +++ b/models/turbine_models/custom_models/sdxl_inference/README.md @@ -20,9 +20,12 @@ Linux: python -m venv turbine_venv source turbine_venv/bin/activate python -m pip install --upgrade pip -pip install -r core/pytorch-cpu-requirements.txt -pip install --pre --upgrade -r core/requirements.txt -pip install --pre -e core +cd .. +git clone https://github.com/iree-org/iree-turbine +cd iree-turbine +pip install -r pytorch-cpu-requirements.txt +pip install -e . +cd ../SHARK-Turbine pip install --pre --upgrade -e models -r models/requirements.txt ``` @@ -31,9 +34,12 @@ Windows: python -m venv turbine_venv turbine_venv/Scripts/activate python -m pip install --upgrade pip -pip install -r core/pytorch-cpu-requirements.txt -pip install --pre --upgrade -r core/requirements.txt -pip install --pre -e core +cd .. +git clone https://github.com/iree-org/iree-turbine +cd iree-turbine +pip install -r pytorch-cpu-requirements.txt +pip install -e . 
+cd ../SHARK-Turbine pip install --pre --upgrade -e models -r models/requirements.txt ``` From 30469fee473b73ad89cbff73fb4b7f6adc4c7441 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 30 Apr 2024 14:00:09 -0500 Subject: [PATCH 046/174] (WIP) Unifies SD/SDXL pipelines --- .../custom_models/sd_inference/sd_pipeline.py | 603 ++++++++++++++++++ 1 file changed, 603 insertions(+) create mode 100644 models/turbine_models/custom_models/sd_inference/sd_pipeline.py diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py new file mode 100644 index 000000000..647b38d56 --- /dev/null +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -0,0 +1,603 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import logging +import copy +import torch +import iree.runtime as ireert +from turbine_models.custom_models.sd_inference import ( + clip, + unet, + vae, + utils +) +from turbine_models.model_runner import vmfbRunner +from transformers import CLIPTokenizer + +from PIL import Image +import os +import numpy as np +import time +from datetime import datetime as dt + +device_list = [ + "cpu", + "vulkan", + "cuda", + "rocm", +] + +rt_device_list = [ + "local-task", + "local-sync", + "vulkan", + "cuda", + "rocm", + "hip", +] + +SUBMODELS = { + "clip": None, + "scheduler": None, + "unet": None, + "vae_decode": None, +} + +class SharkSDPipeline: + def __init__( + self, + hf_model_name: str, + scheduler_id: str, + height: int, + width: int, + precision: str, + max_length: int, + batch_size: int, + num_inference_steps: int, + device: str, + iree_target_triple: str, + ireec_flags: dict = EMPTY_FLAGS, + attn_spec: str = None, + decomp_attn: bool = False, + pipeline_dir: str = "./shark_vmfbs", + external_weights_dir: str = "./shark_weights", + external_weights: str = "safetensors", + vae_decomp_attn: bool = True, + ): + self.hf_model_name = hf_model_name + self.scheduler_id = scheduler_id + self.height = height + self.width = width + self.precision = precision + self.max_length = max_length + self.batch_size = batch_size + self.num_inference_steps = num_inference_steps + self.device = device + self.iree_target_triple = iree_target_triple + self.ireec_flags = ireec_flags if ireec_flags else copy.deepcopy(SUBMODELS) + self.attn_spec = attn_spec + self.decomp_attn = decomp_attn + self.pipeline_dir = pipeline_dir + self.external_weights_dir = external_weights_dir + self.external_weights = external_weights + self.vae_decomp_attn = vae_decomp_attn + self.is_sdxl = "xl" in self.hf_model_name + + # FILE MANAGEMENT AND PIPELINE SETUP + + def check_prepared( + self, + mlirs: dict, + vmfbs: dict, + weights: dict, + interactive: bool = True, + ): + ready, vmfbs, weights = self.is_prepared(vmfbs, weights) + if not ready: + if interactive: + do_continue = input( + f"\nIt seems you are missing some necessary files. Would you like to generate them now? 
(y/n)" + ) + if do_continue.lower() != "y": + exit() + else: + do_continue = "y" + if do_continue.lower() == "y": + for submodel in vmfbs.keys(): + if vmfbs[submodel] == None: + vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) + vmfbs[submodel] = vmfb + if weights[submodel] is None: + weights[submodel] = weight + elif weights[submodel] is None and "scheduler" not in submodel: + _, weight = self.export_submodel(submodel, weights_only=True) + weights[submodel] = weight + ready, vmfbs, weights = self.is_prepared(vmfbs, weights) + if ready: + print("All necessary files found. Generating images.") + return vmfbs, weights + else: + print("There was an error generating the necessary files.") + exit() + else: + print("All necessary files found. Loading pipeline.") + return vmfbs, weights + + def is_prepared(self, vmfbs, weights): + missing = [] + for key in vmfbs: + default_filepath = os.path.join(self.pipeline_dir, key + "_" + self.iree_target_triple + ".vmfb") + if vmfbs[key] is not None and os.path.exists(vmfbs[key]): + continue + elif vmfbs[key] == None and os.path.exists(default_filepath): + vmfbs[key] = default_filepath + else: + missing.append(key + ".vmfb") + for w_key in weights: + if "scheduler" in w_key: + continue + if weights[w_key] is not None and os.path.exists(weights[w_key]): + continue + default_name = os.path.join( + self.external_weights_dir, w_key + "." + self.external_weights + ) + if weights[w_key] is None and os.path.exists(default_name): + weights[w_key] = os.path.join(default_name) + else: + missing.append(w_key + "." + self.external_weights) + if len(missing) > 0: + print(f"Missing files: " + ", ".join(missing)) + return False, vmfbs, weights + else: + return True, vmfbs, weights + + def get_mlir_from_turbine_tank(self, submodel, container_name): + from turbine_models.turbine_tank import downloadModelArtifacts + + safe_name = utils.create_safe_name( + self.hf_model_name, + f"_{self.max_length}_{self.height}x{self.width}_{self.precision}_{submodel}.mlir", + ) + mlir_path = downloadModelArtifacts( + safe_name, + container_name, + ) + return mlir_path + + # IMPORT / COMPILE PHASE + + def get_torch_models(self, submodel): + match submodel: + case "unet": + unet_torch = unet.UnetModel( + self.hf_model_name, + self.height, + self.width, + self.batch_size, + None, + precision=self.precision, + ) + return unet_torch + case "vae_decode": + if not self.custom_vae: + custom_vae = "madebyollin/sdxl-vae-fp16-fix" if self.precision == "fp16" and self.is_sdxl else None + vae_torch = vae.VaeModel( + self.hf_model_name, + custom_vae, + ) + return vae_torch + + def export_submodel( + self, + submodel: str, + input_mlir: str = None, + weights_only: bool = False, + ): + if not os.path.exists(self.pipeline_dir): + os.makedirs(self.pipeline_dir) + if self.external_weights_dir: + if not os.path.exists(self.external_weights_dir): + os.makedirs(external_weights_dir, exist_ok=True) + vae_external_weight_path = os.path.join( + self.external_weights_dir, "vae_decode." + self.external_weights + ) + unet_external_weight_path = os.path.join( + self.external_weights_dir, "unet." + self.external_weights + ) + clip_external_weight_path = os.path.join( + self.external_weights_dir, "clip." + self.external_weights + ) + elif self.external_weights is None: + print( + "No external weights type specified using --external_weights, weights for imported .mlir files will not be externalized." 
+ ) + vae_external_weight_path = None + unet_external_weight_path = None + clip_external_weight_path = None + else: + print( + f"No external weights directory specified using --external_weights_dir, we assume you have your own weights in {self.pipeline_dir}." + ) + external_weights_dir = self.pipeline_dir + if not os.path.exists(self.pipeline_dir): + os.makedirs(self.pipeline_dir, exist_ok=True) + vae_external_weight_path = os.path.join( + self.pipeline_dir, "vae_decode." + self.external_weights + ) + unet_external_weight_path = os.path.join( + self.pipeline_dir, "unet." + self.external_weights + ) + clip_external_weight_path = os.path.join( + self.pipeline_dir, "clip." + self.external_weights + ) + if weights_only: + input_mlir = copy.deepcopy(SUBMODELS) + match submodel: + case "scheduled_unet": + if input_mlir[submodel]: + unet_torch = None + else: + unet_torch = self.get_torch_models("scheduled_unet") + + unet_vmfb = unet.export_unet_model( + unet_torch, + self.hf_model_name, + self.batch_size, + self.height, + self.width, + self.precision, + self.max_length, + None, + "vmfb", + self.external_weights, + unet_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["unet"], + self.decomp_attn, + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + attn_spec=self.attn_spec, + input_mlir=input_mlir["unet"], + weights_only=weights_only, + ) + return unet_vmfb, unet_external_weight_path + case "vae_decode": + if not input_mlir[submodel]: + vae_torch = self.get_torch_models("vae_decode") + else: + vae_torch = None + vae_decode_vmfb = vae.export_vae_model( + vae_torch, + self.hf_model_name, + self.batch_size, + self.height, + self.width, + self.precision, + "vmfb", + self.external_weights, + vae_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["vae"], + "decode", + self.vae_decomp_attn, + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + attn_spec=self.attn_spec, + input_mlir=input_mlir["vae_decode"], + weights_only=weights_only, + ) + return vae_decode_vmfb, vae_external_weight_path + case "clip": + _, clip_vmfb = clip.export_combined_clip( + self.hf_model_name, + None, + self.max_length, + self.precision, + "vmfb", + self.external_weights, + clip_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["clip"], + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + input_mlir=input_mlir["clip"], + attn_spec=self.attn_spec, + weights_only=weights_only, + ) + return clip_vmfb, clip_external_weight_path + + # LOAD + + def load_pipeline( + self, + vmfbs: dict, + weights: dict, + rt_device: str = "local-task", + compiled_pipeline: bool = False, + ): + self.runners = {} + runners = {} + runners["tokenizers"] = [] + runners["tokenizers"] += CLIPTokenizer.from_pretrained( + self.hf_model_name, + subfolder="tokenizer", + ) + if self.is_sdxl: + runners["tokenizers"] += CLIPTokenizer.from_pretrained( + self.hf_model_name, + subfolder="tokenizer_2", + ), + + runners["clip"] = vmfbRunner( + rt_device, vmfbs["clip"], weights["clip"] + ) + runners["scheduler"] = vmfbRunner( + rt_device, vmfbs["scheduler"], weights["scheduler"] + ) + runners["unet"] = vmfbRunner( + rt_device, vmfbs["unet"], weights["unet"] + ) + runners["vae_decode"] = vmfbRunner( + rt_device, vmfbs["vae_decode"], weights["vae_decode"] + ) + self.runners = runners + self.compiled_pipeline = False + print("Successfully loaded pipeline.") + + # RUN + + def generate_images( + self, + prompt: str, + negative_prompt: str = "", + batch_count: int = 
1, + guidance_scale: float = 7.5, + seed: float = -1, + return_imgs: bool = False, + ): + # TODO: implement case where this is false e.g. in SDXL Turbo + # do_classifier_free_guidance = True + + iree_dtype = "float32" if self.precision == "fp32" else "float16" + torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 + + pipe_start = time.time() + + max_length = self.max_length + + samples = [] + numpy_images = [] + + + for i in range(batch_count): + generator = torch.random.manual_seed(seed + i) + rand_sample = torch.randn( + ( + self.batch_size, + 4, + self.height // 8, + self.width // 8, + ), + generator=generator, + dtype=torch_dtype, + ) + samples.append( + ireert.asdevicearray( + self.runners["unet"].config.device, rand_sample, dtype=iree_dtype + ) + ) + + guidance_scale = ireert.asdevicearray( + self.runners["unet"].config.device, + np.asarray([guidance_scale]), + dtype=iree_dtype, + ) + + text_input_ids_list = [] + uncond_input_ids_list = [] + + tokenize_start = time.time() + + # Tokenize prompt and negative prompt. + for tokenizer in self.runners["tokenizers"]: + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + uncond_input_ids = uncond_input.input_ids + + text_input_ids_list.extend( + [ + ireert.asdevicearray( + self.runners["unet"].config.device, text_input_ids + ) + ] + ) + uncond_input_ids_list.extend( + [ + ireert.asdevicearray( + self.runners["unet"].config.device, uncond_input_ids + ) + ] + ) + encode_prompts_start = time.time() + + prompt_embeds, add_text_embeds = self.runners[ + "clip" + ].ctx.modules.compiled_clip["encode_prompts"]( + *text_input_ids_list, *uncond_input_ids_list + ) + + encode_prompts_end = time.time() + + for i in range(batch_count): + unet_start = time.time() + + sample, add_time_ids, timesteps = self.runners["scheduler"].ctx.modules.scheduler[ + "init" + ](samples[i], guidance_scale) + + for t in range(timesteps): + latents = self.runners["scheduler"].ctx.modules.scheduler["scale"]( + sample, t + ) + latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( + latents, prompt_embeds, add_text_embeds, add_time_ids, guidance_scale, t + ) + sample = self.runners["scheduler"].ctx.modules.scheduler["step"]( + sample, latents, t + ) + + vae_start = time.time() + vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( + sample + ) + + pipe_end = time.time() + + image = vae_out.to_host() + + numpy_images.append(image) + print("Batch #", i + 1, "\n") + print( + "UNet time(", + self.num_inference_steps, + "): ", + vae_start - unet_start, + "sec,", + ) + print( + "Unet average step latency: ", + (vae_start - unet_start) / self.num_inference_steps, + "sec", + ) + print("VAE time: ", pipe_end - vae_start, "sec") + print( + f"\nTotal time (txt2img, batch #{str(i+1)}): ", + (encode_prompts_end - encode_prompts_start) + + (pipe_end - unet_start), + "sec\n", + ) + end = time.time() + print("Total CLIP time:", encode_prompts_end - encode_prompts_start, "sec") + print("Total tokenize time:", encode_prompts_start - tokenize_start, "sec") + print("Loading time: ", encode_prompts_start - pipe_start, "sec") + if batch_count > 1: + print( + f"Total inference time ({batch_count} batch(es)):", + end - encode_prompts_start, + "sec", + ) + timestamp = 
dt.now().strftime("%Y-%m-%d_%H-%M-%S") + images = [] + for idx, image in enumerate(numpy_images): + image = torch.from_numpy(image).cpu().permute(0, 2, 3, 1).float().numpy() + image = numpy_to_pil_image(image) + images.append(image[0]) + if return_imgs: + return images + for idx, image in enumerate(images): + img_path = "sdxl_output_" + timestamp + "_" + str(idx) + ".png" + image.save(img_path) + print(img_path, "saved") + return + + +def numpy_to_pil_image(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + +if __name__ == "__main__": + from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args + + mlirs = copy.deepcopy(SUBMODELS) + vmfbs = copy.deepcopy(SUBMODELS) + weights = copy.deepcopy(SUBMODELS) + ireec_flags = { + "clip": args.ireec_flags + args.clip_flags, + "scheduler": args.ireec_flags, + "unet": args.ireec_flags + args.unet_flags, + "vae_decode": args.ireec_flags + args.vae_flags, + } + if not args.pipeline_dir: + pipe_id_list = [ + utils.create_safe_name(args.hf_model_name, args.iree_target_triple), + str(args.height), + str(args.width), + str(args.max_length), + args.precision, + args.device, + ] + args.pipeline_dir = os.path.join( + ".", + "_".join(pipe_id_list), + ) + if args.input_mlir: + user_mlir_list = args.input_mlir.split(",") + else: + user_mlir_list = [] + for submodel_id, mlir_path in zip(mlirs.keys(), user_mlir_list): + if submodel_id in mlir_path: + mlirs[submodel_id] = mlir_path + if not args.external_weights_dir and args.external_weights: + args.external_weights_dir = args.pipeline_dir + + sd_pipe = SharkSDPipeline( + args.hf_model_name, + args.scheduler_id, + args.height, + args.width, + args.precision, + args.max_length, + args.batch_size, + args.num_inference_steps, + args.device, + args.iree_target_triple, + ireec_flags, + args.attn_spec, + args.decomp_attn, + args.pipeline_dir, + args.external_weights_dir, + args.external_weights, + args.vae_decomp_attn, + ) + vmfbs, weights = sd_pipe.check_prepared(mlirs, vmfbs, weights) + sd_pipe.load_pipeline(vmfbs, weights, args.rt_device, args.compiled_pipeline) + sd_pipe.generate_images( + args.prompt, + args.negative_prompt, + args.batch_count, + args.guidance_scale, + args.seed, + False, + ) + print("Image generation complete.") From b7a3f7bb2ae681292dff540c016f578b71e87c2f Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 1 May 2024 11:01:40 -0500 Subject: [PATCH 047/174] (WIP): More refactoring, setup for split schedulers implementation --- .../custom_models/sd_inference/schedulers.py | 253 +++++++-------- .../custom_models/sd_inference/sd_cmd_opts.py | 289 ++++++++++++++++++ .../custom_models/sd_inference/sd_pipeline.py | 66 ++-- .../sdxl_inference/sdxl_schedulers.py | 197 ------------ 4 files changed, 441 insertions(+), 364 deletions(-) create mode 100644 models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py delete mode 100644 models/turbine_models/custom_models/sdxl_inference/sdxl_schedulers.py diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index c7af11bc5..7990ae6ed 100644 --- 
a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -5,188 +5,153 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import os -import sys +from typing import List import torch -from torch.fx.experimental.proxy_tensor import make_fx from shark_turbine.aot import * -from iree import runtime as ireert -import iree.compiler as ireec from iree.compiler.ir import Context import numpy as np -from turbine_models.custom_models.sd_inference import utils -from diffusers import ( - UNet2DConditionModel, -) - -import safetensors -import argparse - from turbine_models.turbine_tank import turbine_tank +from turbine_models.custom_models.sd_inference import utils -parser = argparse.ArgumentParser() -parser.add_argument( - "--hf_auth_token", type=str, help="The Hugging Face auth token, required" -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument( - "--scheduler_id", - type=str, - help="Scheduler ID", - default="PNDM", -) -parser.add_argument( - "--num_inference_steps", type=int, default=50, help="Number of inference steps" -) -parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference" -) -parser.add_argument( - "--height", type=int, default=512, help="Height of Stable Diffusion" -) -parser.add_argument("--width", type=int, default=512, help="Width of Stable Diffusion") -parser.add_argument("--compile_to", type=str, help="torch, linalg, vmfb") -parser.add_argument("--external_weight_path", type=str, default="") -parser.add_argument( - "--external_weights", - type=str, - default=None, - help="saves ir/vmfb without global weights for size and readability, options [safetensors]", -) -parser.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm") -# TODO: Bring in detection for target triple -parser.add_argument( - "--iree_target_triple", - type=str, - default="", - help="Specify vulkan target triple or rocm/cuda target device.", -) -parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") - - -class Scheduler(torch.nn.Module): - def __init__(self, hf_model_name, num_inference_steps, scheduler): - super().__init__() - self.scheduler = scheduler - self.scheduler.set_timesteps(num_inference_steps) - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - ) - self.guidance_scale = 7.5 - - def forward(self, latents, encoder_hidden_states) -> torch.FloatTensor: - latents = latents * self.scheduler.init_noise_sigma - for t in self.scheduler.timesteps: - latent_model_input = torch.cat([latents] * 2) - t = t.unsqueeze(0) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, timestep=t - ) - unet_out = self.unet.forward( - latent_model_input, t, encoder_hidden_states, return_dict=False - )[0] - noise_pred_uncond, noise_pred_text = unet_out.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] - return latents +class SchedulingModel(torch.nn.Module): + def __init__(self, scheduler, height, width): + self.model = scheduler + self.height = height + self.width = width + + def initialize(self, sample): + height = sample.shape[-2] * 8 + width = sample.shape[-1] * 8 + original_size = (height, width) + target_size = (height, width) + crops_coords_top_left = (0, 0) + 
add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids]) + add_time_ids = torch.cat([add_time_ids] * 2, dim=0) + add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) + timesteps = self.scheduler.timesteps + step_indexes = torch.tensor(len(timesteps)) + sample = sample * self.scheduler.init_noise_sigma + return sample.type(self.dtype), add_time_ids, step_indexes def export_scheduler( - scheduler, - hf_model_name, - batch_size, - height, - width, - hf_auth_token=None, - compile_to="torch", - external_weights=None, - external_weight_path=None, - device=None, - target_triple=None, - max_alloc=None, + hf_model_name: str, + scheduler_id: str, + batch_size: int = 1, + height: int = 512, + width: int = 512, + num_inference_steps: int = 30, + precision: str = "fp16", + compile_to: str = "torch", + device: str = None, + target_triple: str = None, + ireec_flags: str = None, + exit_on_vmfb: bool = False, + pipeline_dir: str = None, + input_mlir: str = None, upload_ir=False, ): - mapper = {} - utils.save_external_weights( - mapper, scheduler, external_weights, external_weight_path + schedulers = utils.get_schedulers(hf_model_name) + scheduler = schedulers[scheduler_id] + scheduler_module = SchedulingModel( + hf_model_name, scheduler ) + vmfb_name = ( + scheduler_id + + "_" + + f"{height}x{width}" + + "_" + + precision + + "_" + + str(num_inference_steps), + + "_" + + target_triple + ) + if pipeline_dir: + safe_name = os.path.join( + pipeline_dir, vmfb_name + ) + else: + safe_name = utils.create_safe_name( + hf_model_name, vmfb_name + ) - encoder_hidden_states_sizes = (2, 77, 768) - if hf_model_name == "stabilityai/stable-diffusion-2-1-base": - encoder_hidden_states_sizes = (2, 77, 1024) - - sample = (batch_size, 4, height // 8, width // 8) + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + ) + return vmfb_path + + dtype = torch.float16 if precision == "fp16" else torch.float32 + + if precision == "fp16": + scheduled_unet_model = scheduled_unet_model.half() + + sample = ( + batch_size, + 4, + height // 8, + width // 8, + ) class CompiledScheduler(CompiledModule): - if external_weights: - params = export_parameters( - scheduler, external=True, external_scope="", name_mapper=mapper.get - ) - else: - params = export_parameters(scheduler) - - def main( + params = export_parameters(scheduled_unet_model) + + def run_initialize( self, - sample=AbstractTensor(*sample, dtype=torch.float32), - encoder_hidden_states=AbstractTensor( - *encoder_hidden_states_sizes, dtype=torch.float32 - ), + sample=AbstractTensor(*sample, dtype=dtype), ): - return jittable(scheduler.forward)(sample, encoder_hidden_states) + return jittable(scheduler_module.initialize)(sample) import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledScheduler(context=Context(), import_to=import_to) module_str = str(CompiledModule.get_mlir_module(inst)) - safe_name = utils.create_safe_name(hf_model_name, "-scheduler") - if upload_ir: - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - model_name_upload = hf_model_name.replace("/", "-") - model_name_upload = model_name_upload + "_scheduler" - blob_name = turbine_tank.uploadToBlobStorage( - str(os.path.abspath(f"{safe_name}.mlir")), - f"{model_name_upload}/{model_name_upload}.mlir", - ) + if compile_to != "vmfb": return module_str - else: - 
utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) - if upload_ir: - return blob_name + elif compile_to == "vmfb": + vmfb = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=True, + ) + if exit_on_vmfb: + exit() + return vmfb if __name__ == "__main__": - args = parser.parse_args() - schedulers = utils.get_schedulers(args.hf_model_name) - scheduler = schedulers[args.scheduler_id] - scheduler_module = Scheduler( - args.hf_model_name, args.num_inference_steps, scheduler - ) + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + mod_str = export_scheduler( - scheduler_module, args.hf_model_name, + args.scheduler_id, args.batch_size, args.height, args.width, - args.hf_auth_token, + args.num_inference_steps, + args.precision, args.compile_to, - args.external_weights, - args.external_weight_path, args.device, args.iree_target_triple, - args.vulkan_max_allocation, + args.ireec_flags, + exit_on_vmfb=False, + input_mlir=args.input_mlir, ) - safe_name = utils.create_safe_name(args.hf_model_name, "-scheduler") + safe_name = utils.create_safe_name(args.hf_model_name, "_" + args.scheduler_id + "_" + str(args.num_inference_steps)) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py new file mode 100644 index 000000000..9b1a41767 --- /dev/null +++ b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py @@ -0,0 +1,289 @@ +import argparse +import os +from pathlib import Path + + +def path_expand(s): + return Path(s).expanduser().resolve() + + +def is_valid_file(arg): + if not os.path.exists(arg): + return None + else: + return arg + + +# Note: this is where command-line options for the scripts in this directory +# are defined along with their defaults. Thus, they should not be referenced +# within modelling or inference code, only at the entry point to the script. + +# We should consider separating out the options that are "model configs" from +# the options that control the compiler, runtime, and script behavior, +# when applicable, as the formermost would best be kept in a separate +# config or imported from huggingface. + +p = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter +) + +############################################################################## +# SDXL Huggingface Options +############################################################################## + +p.add_argument( + "--hf_auth_token", + type=str, + help="The Hugging Face auth token, if required", + default=None, +) +p.add_argument( + "--hf_model_name", + type=str, + help="HF model name", + default="stabilityai/stable-diffusion-xl-base-1.0", +) +p.add_argument( + "--scheduler_id", + type=str, + help="Scheduler ID", + default="PNDM", +) + +############################################################################## +# SDXL Inference Options +# These options are used to control runtime parameters for SDXL inference. 
+############################################################################## + +p.add_argument( + "--prompt", + type=str, + default=" a cat under the snow with blue eyes, covered by snow, cinematic style, medium shot, professional photo, animal", + help="Prompt input to stable diffusion.", +) + +p.add_argument( + "--negative_prompt", + type=str, + default="Watermark, blurry, oversaturated, low resolution, pollution", + help="Negative prompt input to stable diffusion.", +) + +p.add_argument( + "--num_inference_steps", type=int, default=30, help="Number of UNet inference steps" +) + +p.add_argument( + "--batch_count", + type=int, + default=1, + help="Number of batches to run for a single prompt", +) + +p.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help="Scale by which to adjust prompt guidance to the unconditional noise prediction output of UNet after each iteration.", +) + +p.add_argument( + "--seed", type=float, default=0, help="Seed for random number/latents generation." +) + +p.add_argument( + "--external_weight_path", + type=str, + default="", + help="Path to external weights file, for jobs with one weights filepath. When importing, this is used to specify where to save the model weights, and at runtime, this is used to specify where to load the model weights from.", +) + +p.add_argument( + "--external_weights_dir", + type=str, + default="", + help="Directory containing external weights for a job that requires more than one weights file. When importing, this is used to specify where to save the model weights, and at runtime, this is used to specify where to load the model weights from. Files will then be saved according to the parameters that make them unique, i.e. ___.", +) + +p.add_argument( + "--vmfb_path", type=str, default="", help="path to vmfb containing compiled module" +) + +p.add_argument( + "--pipeline_vmfb_path", + type=str, + default="", + help="path to vmfb containing compiled meta-module", +) + +p.add_argument( + "--external_weight_file", + type=str, + default=None, + help="Path to external weights, used in benchmark scripts.", +) + +p.add_argument( + "--pipeline_dir", + type=str, + default=None, + help="Directory to save pipeline artifacts", +) + +p.add_argument( + "--compiled_pipeline", + default=False, + action="store_true", + help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.", +) + +############################################################################## +# SDXL Modelling Options +# These options are used to control model defining parameters for SDXL. +# These are MLIR - changing variables! If you change them, you will need +# to import/download and recompile the model. +############################################################################## + +p.add_argument("--batch_size", type=int, default=1, help="Batch size for inference") +p.add_argument( + "--height", type=int, default=1024, help="Height of Stable Diffusion output image." 
+) +p.add_argument( + "--width", type=int, default=1024, help="Width of Stable Diffusion output image" +) +p.add_argument( + "--precision", + type=str, + default="fp16", + help="Precision of Stable Diffusion weights and graph.", +) +p.add_argument( + "--max_length", type=int, default=64, help="Sequence Length of Stable Diffusion" +) +p.add_argument("--vae_variant", type=str, default="decode", help="encode, decode") +p.add_argument( + "--return_index", + action="store_true", + help="Make scheduled unet compiled module return the step index.", +) + +p.add_argument( + "--vae_decomp_attn", + type=bool, + default=False, + help="Decompose attention for VAE decode only at fx graph level", +) + +############################################################################## +# SDXL script general options. +############################################################################## + +p.add_argument("--compile_to", type=str, default="mlir", help="torch, linalg, vmfb") + +p.add_argument( + "--external_weights", + type=str, + default=None, + choices=["safetensors", "irpa", "gguf", None], + help="Externalizes model weights from the torch dialect IR and its successors", +) + +# See --external_weight_path and --external_weight_dir to specify where to save the model weights. + +p.add_argument( + "--compare_vs_torch", + action="store_true", + help="Runs both turbine vmfb and a torch model to compare results", +) +p.add_argument( + "--decomp_attn", + default=False, + action="store_true", + help="Decompose attention at fx graph level", +) +p.add_argument( + "--exit_on_vmfb", + default=True, + action="store_false", + help="Exit program on vmfb compilation completion. Most scripts will also save .mlir if this is disabled.", +) +p.add_argument( + "--input_mlir", + type=str, + default=None, + help="Path to input mlir file to compile. Comma-separate paths to provide more than one input to pipelines.", +) +p.add_argument( + "--download_mlir", + default=False, + action="store_true", + help="Download missing mlir files from Azure storage.", +) +p.add_argument( + "--container_name", + type=str, + default=None, + help="Azure storage container name to download mlir files from.", +) + + +############################################################################## +# IREE Compiler Options +############################################################################## + +p.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm") + +p.add_argument( + "--rt_device", + type=str, + default="local-task", + help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.", +) + +# TODO: Bring in detection for target triple +p.add_argument( + "--iree_target_triple", + type=str, + default="", + help="Specify vulkan target triple or rocm/cuda target device.", +) + +p.add_argument("--ireec_flags", type=str, default="", help="extra iree-compile options") + +p.add_argument( + "--attn_flags", + type=str, + default="", + help="extra iree-compile options for models with iree_linalg_ext.attention ops.", +) + +p.add_argument( + "--attn_spec", + type=str, + default=None, + help="extra iree-compile options for models with iree_linalg_ext.attention ops. Set this to 'default' if you are using mfma-capable hardware with ROCM.", +) + +p.add_argument( + "--clip_flags", + type=str, + default="", + help="extra iree-compile options to send for compiling CLIP/prompt_encoder. Only use this for testing bleeding edge flags! 
Any default options should be added to sd_inference/utils.py", +) + +p.add_argument( + "--vae_flags", + type=str, + default="", + help="extra iree-compile options to send for compiling VAE. Only use this for testing bleeding edge flags! Any default options should be added to sd_inference/utils.py", +) + +p.add_argument( + "--unet_flags", + type=str, + default="", + help="extra iree-compile options to send for compiling unet. Only use this for testing bleeding edge flags! Any default options should be added to sd_inference/utils.py", +) + + +args, unknown_args = p.parse_known_args() diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 647b38d56..93e55a9fa 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -12,7 +12,8 @@ clip, unet, vae, - utils + schedulers, + utils, ) from turbine_models.model_runner import vmfbRunner from transformers import CLIPTokenizer @@ -236,11 +237,48 @@ def export_submodel( if weights_only: input_mlir = copy.deepcopy(SUBMODELS) match submodel: - case "scheduled_unet": + case "clip": + _, clip_vmfb = clip.export_combined_clip( + self.hf_model_name, + None, + self.max_length, + self.precision, + "vmfb", + self.external_weights, + clip_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["clip"], + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + input_mlir=input_mlir["clip"], + attn_spec=self.attn_spec, + weights_only=weights_only, + ) + return clip_vmfb, clip_external_weight_path + case "scheduler": + scheduler_vmfb = schedulers.export_scheduler( + self.hf_model_name, + self.scheduler_id, + self.batch_size, + self.height, + self.width, + self.num_inference_steps, + self.precision, + "vmfb", + self.device, + self.iree_target_triple, + self.ireec_flags["scheduler"], + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + input_mlir=input_mlir["scheduler"], + ) + return scheduler_vmfb, None + case "unet": if input_mlir[submodel]: unet_torch = None else: - unet_torch = self.get_torch_models("scheduled_unet") + unet_torch = self.get_torch_models("unet") unet_vmfb = unet.export_unet_model( unet_torch, @@ -292,25 +330,7 @@ def export_submodel( weights_only=weights_only, ) return vae_decode_vmfb, vae_external_weight_path - case "clip": - _, clip_vmfb = clip.export_combined_clip( - self.hf_model_name, - None, - self.max_length, - self.precision, - "vmfb", - self.external_weights, - clip_external_weight_path, - self.device, - self.iree_target_triple, - self.ireec_flags["clip"], - exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, - input_mlir=input_mlir["clip"], - attn_spec=self.attn_spec, - weights_only=weights_only, - ) - return clip_vmfb, clip_external_weight_path + # LOAD @@ -452,7 +472,7 @@ def generate_images( sample, add_time_ids, timesteps = self.runners["scheduler"].ctx.modules.scheduler[ "init" - ](samples[i], guidance_scale) + ](samples[i]) for t in range(timesteps): latents = self.runners["scheduler"].ctx.modules.scheduler["scale"]( diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_schedulers.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_schedulers.py deleted file mode 100644 index a3ae29595..000000000 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_schedulers.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2023 Nod Labs, Inc -# -# Licensed under the Apache License v2.0 with LLVM 
Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# from @aviator19941's gist : https://gist.github.com/aviator19941/4e7967bd1787c83ee389a22637c6eea7 - -import os -import sys - -from iree import runtime as ireert -from iree.compiler.ir import Context -import numpy as np -from shark_turbine.aot import * -from turbine_models.custom_models.sd_inference import utils -import torch -import torch._dynamo as dynamo -from diffusers import UNet2DConditionModel -from shark_turbine.dynamo.passes import ( - DEFAULT_DECOMPOSITIONS, -) - -import safetensors - - -class SDXLScheduler(torch.nn.Module): - def __init__( - self, - hf_model_name, - num_inference_steps, - scheduler, - hf_auth_token=None, - precision="fp32", - ): - super().__init__() - self.scheduler = scheduler - self.scheduler.set_timesteps(num_inference_steps) - self.guidance_scale = 7.5 - if precision == "fp16": - try: - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - auth_token=hf_auth_token, - low_cpu_mem_usage=False, - variant="fp16", - ) - except: - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - auth_token=hf_auth_token, - low_cpu_mem_usage=False, - ) - else: - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - auth_token=hf_auth_token, - low_cpu_mem_usage=False, - ) - - def forward(self, sample, prompt_embeds, text_embeds, time_ids): - sample = sample * self.scheduler.init_noise_sigma - for t in self.scheduler.timesteps: - with torch.no_grad(): - added_cond_kwargs = { - "text_embeds": text_embeds, - "time_ids": time_ids, - } - latent_model_input = torch.cat([sample] * 2) - t = t.unsqueeze(0) - # print('UNSQUEEZE T:', t) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, timestep=t - ) - noise_pred = self.unet.forward( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=None, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - sample = self.scheduler.step(noise_pred, t, sample, return_dict=False)[ - 0 - ] - return sample - - -def export_scheduler( - scheduler, - hf_model_name, - batch_size, - height, - width, - hf_auth_token=None, - compile_to="torch", - external_weights=None, - external_weight_path=None, - device=None, - target_triple=None, - ireec_flags=None, -): - mapper = {} - utils.save_external_weights( - mapper, scheduler, external_weights, external_weight_path - ) - - decomp_list = DEFAULT_DECOMPOSITIONS - - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) - # tensor shapes for tracing - sample = (batch_size, 4, height // 8, width // 8) - prompt_embeds = (2, 77, 2048) - text_embeds = (2, 1280) - time_ids = (2, 6) - - class CompiledScheduler(CompiledModule): - if external_weights: - params = export_parameters( - scheduler, external=True, external_scope="", name_mapper=mapper.get - ) - else: - params = export_parameters(scheduler) - - def main( - self, - sample=AbstractTensor(*sample, dtype=torch.float32), - prompt_embeds=AbstractTensor(*prompt_embeds, dtype=torch.float32), - text_embeds=AbstractTensor(*text_embeds, dtype=torch.float32), - time_ids=AbstractTensor(*time_ids, 
dtype=torch.float32), - ): - return jittable(scheduler.forward, decompose_ops=decomp_list)( - sample, prompt_embeds, text_embeds, time_ids - ) - - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledScheduler(context=Context(), import_to=import_to) - - module_str = str(CompiledModule.get_mlir_module(inst)) - - safe_name = utils.create_safe_name(hf_model_name, "-scheduler") - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - print("Saved to", safe_name + ".mlir") - - if compile_to != "vmfb": - return module_str - else: - utils.compile_to_vmfb(module_str, device, target_triple, ireec_flags, safe_name) - - -if __name__ == "__main__": - from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args - - hf_model_name = "stabilityai/stable-diffusion-xl-base-1.0" - schedulers = utils.get_schedulers(args.hf_model_name) - scheduler = schedulers[args.scheduler_id] - scheduler_module = SDXLScheduler( - args.hf_model_name, - args.num_inference_steps, - scheduler, - hf_auth_token=None, - precision=args.precision, - ) - - print("export scheduler begin") - mod_str = export_scheduler( - scheduler_module, - args.hf_model_name, - args.batch_size, - args.height, - args.width, - args.hf_auth_token, - args.compile_to, - args.external_weights, - args.external_weight_path, - args.device, - args.iree_target_triple, - args.ireec_flags, - ) - print("export scheduler complete") - safe_name = utils.create_safe_name(args.hf_model_name, "-scheduler") - with open(f"{safe_name}.mlir", "w+") as f: - f.write(mod_str) - print("Saved to", safe_name + ".mlir") From 3df8532a1a4db40affa832c6b85ce59fe12878e7 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 1 May 2024 16:15:43 -0500 Subject: [PATCH 048/174] Update test_models.yml --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 0c81e2409..9d7e7aab6 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -50,7 +50,7 @@ jobs: # from non default locations first. Installing the PyTorch CPU # wheels saves multiple minutes and a lot of bandwidth on runner setup. 
pip install --no-compile -r ${{ github.workspace }}/iree-turbine/pytorch-cpu-requirements.txt - pip install --no-compile --pre --upgrade -r ${{ github.workspace }}/iree-turbine/requirements.txt + pip install --pre --upgrade -r ${{ github.workspace }}/iree-turbine/requirements.txt pip install --no-compile --pre -e ${{ github.workspace }}/iree-turbine[testing] pip install --upgrade --pre --no-cache-dir iree-compiler iree-runtime -f https://iree.dev/pip-release-links.html pip install --no-compile --pre --upgrade -e models -r models/requirements.txt From bd851e3b9b7a1dacb9c721a48ed19e342e179dc5 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 1 May 2024 16:16:48 -0500 Subject: [PATCH 049/174] Update requirements.txt --- models/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/requirements.txt b/models/requirements.txt index ed2a0b0c1..07f7bdfcd 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -1,6 +1,6 @@ protobuf sentencepiece -shark_turbine +shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 accelerate diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release From 66e19f6ed216a399b9cb0559670fb3928b570131 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 1 May 2024 18:07:00 -0500 Subject: [PATCH 050/174] Try only pinning brevitas in requirements. --- models/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/setup.py b/models/setup.py index fae7c4a61..5b1b56915 100644 --- a/models/setup.py +++ b/models/setup.py @@ -55,7 +55,7 @@ def load_version_info(): ), install_requires=[ "Shark-Turbine", - "brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b", + "brevitas", "protobuf", "sentencepiece", "transformers==4.37.1", From aab85d5ba7fb9e6fe0b7d247b8fcb48bc6b79d3a Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Fri, 3 May 2024 13:30:43 -0500 Subject: [PATCH 051/174] DNM: remove brevitas from models requirements --- models/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/models/requirements.txt b/models/requirements.txt index 07f7bdfcd..b3f01e4aa 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -4,7 +4,6 @@ shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 accelerate diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release -brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading azure-storage-blob # microsoft/phi model From e449bb2bc06f3d7ab17787c9dac889a4e6021824 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Fri, 3 May 2024 13:31:20 -0500 Subject: [PATCH 052/174] DNM: Remove brevitas from setup.py --- models/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/models/setup.py b/models/setup.py index 5b1b56915..e051b665e 100644 --- a/models/setup.py +++ b/models/setup.py @@ -55,7 +55,6 @@ def load_version_info(): ), install_requires=[ "Shark-Turbine", - "brevitas", "protobuf", "sentencepiece", "transformers==4.37.1", From 49b1937b6d2b32166788c5d0f3a0cf539b783772 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Fri, 3 May 2024 14:32:03 -0500 Subject: [PATCH 053/174] Update utils.py --- .../custom_models/sd_inference/utils.py | 14 ++++++++++---- 1 file changed, 10 
insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index c7c134885..ba0ce3f25 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -17,14 +17,20 @@ "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", "--iree-opt-data-tiling=false", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-opt-aggressively-propagate-transposes=true", "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], - "unet": ["--iree-flow-enable-aggressive-fusion"], - "clip": [], + "unet": [ + "--iree-flow-enable-aggressive-fusion", + "--iree-global-opt-enable-fuse-horizontal-contractions=true", + "--iree-opt-aggressively-propagate-transposes=true", + ], + "clip": [ + "--iree-flow-enable-aggressive-fusion", + "--iree-global-opt-enable-fuse-horizontal-contractions=true", + "--iree-opt-aggressively-propagate-transposes=true", + ], "vae": ["--iree-flow-enable-aggressive-fusion"], } From cd44eec4d5a3962dca15646dbf02778226009f67 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 8 May 2024 16:43:35 -0500 Subject: [PATCH 054/174] use np.testing.assert_allclose in unet_runner --- .../custom_models/sdxl_inference/unet_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index 197d850a9..a4c2f812a 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -158,9 +158,9 @@ def run_torch_unet( # precision="fp16", ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - err = utils.largest_error(torch_output, turbine_output) - print("Largest Error: ", err) - assert err < 9e-3 + atol=7e-2 + rtol=1e-4 + np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol # TODO: Figure out why we occasionally segfault without unlinking output variables turbine_output = None From ef53b08afc11450b02cd084d579b80df3c64a61a Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 8 May 2024 16:44:45 -0500 Subject: [PATCH 055/174] fixup typo. 
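
The previous commit introduced the np.testing.assert_allclose comparison but dropped its closing parenthesis; this restores it. For reference, the intended comparison in unet_runner.py now reads roughly as below (a standalone sketch with placeholder arrays; the tolerance values themselves are loosened again in a later commit):

    import numpy as np

    # Placeholder arrays standing in for the turbine (IREE) and torch outputs.
    turbine_output = np.zeros((1, 4, 128, 128), dtype=np.float32)
    torch_output = np.zeros((1, 4, 128, 128), dtype=np.float32)

    atol = 7e-2
    rtol = 1e-4
    # Elementwise check; fails if |turbine - torch| > atol + rtol * |torch| anywhere.
    np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol)
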
--- .../turbine_models/custom_models/sdxl_inference/unet_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index a4c2f812a..e307a2ea8 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -160,7 +160,7 @@ def run_torch_unet( print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) atol=7e-2 rtol=1e-4 - np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol + np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol) # TODO: Figure out why we occasionally segfault without unlinking output variables turbine_output = None From c85b00a81e0fa912065b0d28e485a55454cddc63 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 8 May 2024 16:47:10 -0500 Subject: [PATCH 056/174] update tolerances --- .../custom_models/sdxl_inference/unet_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index e307a2ea8..aca0cb745 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -158,8 +158,8 @@ def run_torch_unet( # precision="fp16", ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - atol=7e-2 - rtol=1e-4 + atol=4e-2 + rtol=4e-1 np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol) # TODO: Figure out why we occasionally segfault without unlinking output variables From 25fd6c4bb89a6b8e23704af4c7e86112fc537667 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 11 May 2024 00:35:35 -0500 Subject: [PATCH 057/174] formatting --- .../custom_models/sdxl_inference/unet_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index aca0cb745..60cc206f1 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -158,8 +158,8 @@ def run_torch_unet( # precision="fp16", ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - atol=4e-2 - rtol=4e-1 + atol = 4e-2 + rtol = 4e-1 np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol) # TODO: Figure out why we occasionally segfault without unlinking output variables From 7f84246ca9133adeccf2f7978f0b8faaf94140a4 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 11 May 2024 00:40:06 -0500 Subject: [PATCH 058/174] Enable multiple batch size test. 
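
This adds a second rocm/gfx90a run of the SDXL test suite with --batch_size 2, so the batched compilation path gets CI coverage. For context, the batch size is the leading dimension of the latent sample fed to the unet, so the shape-specialized submodels must be exported for it; a minimal sketch of the shape involved, assuming fp16 and the default 1024x1024 resolution:

    import torch

    batch_size, height, width = 2, 1024, 1024
    # Initial latents, mirroring the shape built in the pipeline's generate_images.
    rand_sample = torch.randn(
        batch_size, 4, height // 8, width // 8, dtype=torch.float16
    )
    print(rand_sample.shape)  # torch.Size([2, 4, 128, 128])
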
--- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 9d7e7aab6..184b1458f 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -72,4 +72,4 @@ jobs: pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 5 pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default - + pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default --batch_size 2 From 12ca4c62ccfcfac6ec4e33227957f63f29e5a540 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Sun, 12 May 2024 12:04:10 -0500 Subject: [PATCH 059/174] Simplify test_shark setup. --- .github/workflows/test_shark.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_shark.yml b/.github/workflows/test_shark.yml index a60a098bd..301376a47 100644 --- a/.github/workflows/test_shark.yml +++ b/.github/workflows/test_shark.yml @@ -49,7 +49,6 @@ jobs: cd $GITHUB_WORKSPACE/SHARK python${{ matrix.version }} -m venv shark.venv source shark.venv/bin/activate - sed -i 's/SHARK-Turbine#/SHARK-Turbine.git@${{github.sha}}#/g' requirements.txt pip install -r requirements.txt --no-cache-dir pip install -e . python apps/shark_studio/tests/api_test.py From af3fc0c7c08d76ca35e2c1b703a90ec2fbf044a5 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Mon, 13 May 2024 11:35:05 -0500 Subject: [PATCH 060/174] Update requirements.txt --- models/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/models/requirements.txt b/models/requirements.txt index b3f01e4aa..899a016f8 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -8,3 +8,4 @@ diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release azure-storage-blob # microsoft/phi model einops +pytest From 82363c589595600fe4381bb4a72b786e3d9204b5 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 13 May 2024 13:44:07 -0500 Subject: [PATCH 061/174] Fix batch size configurability for submodels. 
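
This threads batch_size through the submodel export entry points (clip, prompt encoder, scheduled unet, unet, vae) and into the "safe names" used for .mlir/.vmfb artifacts via a _bs{batch_size} segment, so exports for different batch sizes no longer overwrite each other. A rough sketch of the resulting naming, using a simplified, hypothetical stand-in for utils.create_safe_name:

    def make_safe_name(hf_model_name: str, suffix: str) -> str:
        # Hypothetical stand-in; the real helper lives in
        # turbine_models/custom_models/sd_inference/utils.py.
        return (hf_model_name.split("/")[-1] + suffix).replace(".", "_")

    batch_size, max_length, height, width, precision = 2, 64, 1024, 1024, "fp16"
    print(
        make_safe_name(
            "stabilityai/stable-diffusion-xl-base-1.0",
            f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet",
        )
    )
    # -> stable-diffusion-xl-base-1_0_bs2_64_1024x1024_fp16_unet
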
--- .../turbine_models/custom_models/sdxl_inference/clip.py | 9 ++++++--- .../custom_models/sdxl_inference/sdxl_prompt_encoder.py | 6 ++++-- .../custom_models/sdxl_inference/sdxl_scheduled_unet.py | 6 +++--- .../turbine_models/custom_models/sdxl_inference/unet.py | 4 ++-- .../turbine_models/custom_models/sdxl_inference/vae.py | 4 ++-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/clip.py b/models/turbine_models/custom_models/sdxl_inference/clip.py index 20b0aa7ae..c5e583f86 100644 --- a/models/turbine_models/custom_models/sdxl_inference/clip.py +++ b/models/turbine_models/custom_models/sdxl_inference/clip.py @@ -48,6 +48,7 @@ def forward(self, input): def export_clip_model( hf_model_name, hf_auth_token=None, + batch_size=1, max_length=77, precision="fp16", compile_to="torch", @@ -67,7 +68,7 @@ def export_clip_model( safe_name = os.path.join(pipeline_dir, "clip_" + str(index)) else: safe_name = utils.create_safe_name( - hf_model_name, f"-{str(max_length)}-{precision}-clip-{index}-{device}" + hf_model_name, f"_bs{str(batch_size)}-{str(max_length)}-{precision}-clip-{index}-{device}" ) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -160,6 +161,7 @@ def main(self, inp=AbstractTensor(1, max_length, dtype=torch.int64)): mod_1_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, + args.batch_size, args.max_length, args.precision, args.compile_to, @@ -177,6 +179,7 @@ def main(self, inp=AbstractTensor(1, max_length, dtype=torch.int64)): mod_2_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, + args.batch_size, args.max_length, args.precision, args.compile_to, @@ -194,10 +197,10 @@ def main(self, inp=AbstractTensor(1, max_length, dtype=torch.int64)): if args.input_mlir: exit() safe_name_1 = safe_name = utils.create_safe_name( - args.hf_model_name, f"_{str(args.max_length)}_{args.precision}_clip_1" + args.hf_model_name, f"_bs{str(args.batch_size)}_{str(args.max_length)}_{args.precision}_clip_1" ) safe_name_2 = safe_name = utils.create_safe_name( - args.hf_model_name, f"_{str(args.max_length)}_{args.precision}_clip_2" + args.hf_model_name, f"_bs{str(args.batch_size)}_{str(args.max_length)}_{args.precision}_clip_2" ) with open(f"{safe_name_1}.mlir", "w+") as f: f.write(mod_1_str) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 1f56031ed..24bbbdf0f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -102,6 +102,7 @@ def forward( def export_prompt_encoder( hf_model_name, hf_auth_token=None, + batch_size=1, max_length=64, precision="fp16", compile_to="torch", @@ -124,7 +125,7 @@ def export_prompt_encoder( safe_name = os.path.join(pipeline_dir, "prompt_encoder") else: safe_name = utils.create_safe_name( - hf_model_name, f"-{str(max_length)}-{precision}-prompt-encoder-{device}" + hf_model_name, f"-bs{batch_size}-{str(max_length)}-{precision}-prompt-encoder-{device}" ) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -216,6 +217,7 @@ def encode_prompts( mod_str, _ = export_prompt_encoder( args.hf_model_name, args.hf_auth_token, + args.batch_size, args.max_length, args.precision, args.compile_to, @@ -232,7 +234,7 @@ def encode_prompts( if args.input_mlir: exit() safe_name_1 = safe_name = utils.create_safe_name( - args.hf_model_name, 
f"_{str(args.max_length)}_{args.precision}_prompt_encoder" + args.hf_model_name, f"_bs{str(args.batch_size)}_{str(args.max_length)}_{args.precision}_prompt_encoder" ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 36f92f8ae..a0f3e0390 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -148,7 +148,7 @@ def export_scheduled_unet_model( else: safe_name = utils.create_safe_name( hf_model_name, - f"_{max_length}_{height}x{width}_{precision}_scheduled_unet_{device}", + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_scheduled_unet_{device}", ) if input_mlir: @@ -307,7 +307,7 @@ def export_pipeline_module(args): args.num_inference_steps, args.return_index, ) - if args.compile_to == "vmfb": + if args.compile_to == "vmfb" and args.pipeline_dir is not None: pipeline_vmfb_path = export_pipeline_module(args) mod_str = export_scheduled_unet_model( scheduled_unet_model, @@ -336,7 +336,7 @@ def export_pipeline_module(args): exit() safe_name = utils.create_safe_name( args.hf_model_name + "_" + args.scheduler_id, - f"_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet_{str(args.num_inference_steps)}", + f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet_{str(args.num_inference_steps)}", ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index e59a0d79a..ca1781fa6 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -107,7 +107,7 @@ def export_unet_model( do_classifier_free_guidance = True safe_name = utils.create_safe_name( - hf_model_name, f"_{max_length}_{height}x{width}_{precision}_unet_{device}" + hf_model_name, f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}" ) if input_mlir: @@ -236,7 +236,7 @@ def main( exit() safe_name = utils.create_safe_name( args.hf_model_name, - f"_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet", + f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet", ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index b5bb5225f..5f7726dc8 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -89,7 +89,7 @@ def export_vae_model( safe_name = os.path.join(pipeline_dir, "vae_" + variant) else: safe_name = utils.create_safe_name( - hf_model_name, f"_{height}x{width}_{precision}_vae_{variant}_{device}" + hf_model_name, f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}" ) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -196,7 +196,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): exit() safe_name = utils.create_safe_name( args.hf_model_name, - f"_{args.height}x{args.width}_{args.precision}_vae_{args.vae_variant}", + f"_bs{str(args.batch_size)}_{args.height}x{args.width}_{args.precision}_vae_{args.vae_variant}", ) with open(f"{safe_name}.mlir", "w+") as f: 
f.write(mod_str) From f1f37e373ede41fc38ec898275192c06ff3bf916 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 16 May 2024 19:51:10 -0500 Subject: [PATCH 062/174] Revert change to clip export function signature --- .../custom_models/sdxl_inference/sdxl_prompt_encoder.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 24bbbdf0f..6a4e6be43 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -102,7 +102,6 @@ def forward( def export_prompt_encoder( hf_model_name, hf_auth_token=None, - batch_size=1, max_length=64, precision="fp16", compile_to="torch", @@ -125,7 +124,7 @@ def export_prompt_encoder( safe_name = os.path.join(pipeline_dir, "prompt_encoder") else: safe_name = utils.create_safe_name( - hf_model_name, f"-bs{batch_size}-{str(max_length)}-{precision}-prompt-encoder-{device}" + hf_model_name, f"{str(max_length)}-{precision}-prompt-encoder-{device}" ) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -234,7 +233,7 @@ def encode_prompts( if args.input_mlir: exit() safe_name_1 = safe_name = utils.create_safe_name( - args.hf_model_name, f"_bs{str(args.batch_size)}_{str(args.max_length)}_{args.precision}_prompt_encoder" + args.hf_model_name, f"{str(args.max_length)}_{args.precision}_prompt_encoder" ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) From 86561e0a33dea8f528769a4fee30a38597d40361 Mon Sep 17 00:00:00 2001 From: ean garvey Date: Sun, 19 May 2024 02:20:22 -0400 Subject: [PATCH 063/174] TD spec fixes --- .../sd_inference/default_mfma_attn_spec.mlir | 241 +++++++++++++++++- .../custom_models/sd_inference/utils.py | 2 +- 2 files changed, 240 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir index d5c93011d..e7e1d8bf5 100644 --- a/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir +++ b/models/turbine_models/custom_models/sd_inference/default_mfma_attn_spec.mlir @@ -42,7 +42,7 @@ module attributes { transform.with_named_sequence } { // Tile batch dimensions of attention // ========================================== %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %top_level_func { transform.apply_patterns.canonicalization @@ -242,7 +242,7 @@ module attributes { transform.with_named_sequence } { // Tile batch dimensions of attention // ========================================== %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %batch_tiled_attn, %loop = 
transform.structured.tile_using_for %attention2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %top_level_func { transform.apply_patterns.canonicalization @@ -460,6 +460,231 @@ module attributes { transform.with_named_sequence } { transform.yield %attention : !transform.any_op } +//===----------------------------------------------------------------------===// +// Matmul tuning +//===----------------------------------------------------------------------===// + + transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) { + transform.match.operation_name %root ["linalg.generic"] : !transform.any_op + // transform.print %root {name = "Generic"} : !transform.any_op + %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root { + ^bb0(%lhs: tensor, %rhs: tensor, %out: tensor): + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%lhs, %rhs : tensor, tensor) outs(%out : tensor) { + ^bb0(%in: f16, %in_0: f16, %acc: f32): + %8 = arith.extf %in : f16 to f32 + %9 = arith.extf %in_0 : f16 to f32 + %10 = arith.mulf %8, %9 : f32 + %11 = arith.addf %acc, %10 : f32 + linalg.yield %11 : f32 + } -> tensor + } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) + transform.yield %root : !transform.any_op + } + + transform.named_sequence @match_mmt_f16_f16_f16(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) { + transform.match.operation_name %root ["linalg.generic"] : !transform.any_op + // transform.print %root {name = "Generic"} : !transform.any_op + %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root { + ^bb0(%lhs: tensor, %rhs: tensor, %out: tensor): + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%lhs, %rhs : tensor, tensor) outs(%out : tensor) { + ^bb0(%in: f16, %in_0: f16, %acc: f16): + %10 = arith.mulf %in, %in_0 : f16 + %11 = arith.addf %acc, %10 : f16 + linalg.yield %11 : f16 + } -> tensor + } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) + transform.yield %root : !transform.any_op + } + + transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly}, %config: !transform.any_param {transform.readonly}) { + transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param + // transform.print %op {name = "Applied"} : !transform.any_op + transform.yield + } + + transform.named_sequence @match_mmt_2048x10240x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { + %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op + %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value + %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value + transform.iree.match.cast_compatible_type %lhs = tensor<2048x1280xf16> : !transform.any_value + transform.iree.match.cast_compatible_type %rhs = tensor<10240x1280xf16> : 
!transform.any_value + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 2, subgroup_n_count = 2> + , no_reorder_workgroups}> + > -> !transform.any_param + transform.yield %matmul, %config : !transform.any_op, !transform.any_param + } + + transform.named_sequence @match_mmt_2048x1280x5120(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { + %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op + %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value + %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value + transform.iree.match.cast_compatible_type %lhs = tensor<2048x5120xf16> : !transform.any_value + transform.iree.match.cast_compatible_type %rhs = tensor<1280x5120xf16> : !transform.any_value + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 4, subgroup_n_count = 1> + , no_reorder_workgroups, llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + > -> !transform.any_param + transform.yield %matmul, %config : !transform.any_op, !transform.any_param + } + + + transform.named_sequence @match_mmt_2048x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { + %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op + %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value + %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value + transform.iree.match.cast_compatible_type %lhs = tensor<2048x1280xf16> : !transform.any_value + transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xf16> : !transform.any_value + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 4, subgroup_n_count = 1> + }> + > -> !transform.any_param + transform.yield %matmul, %config : !transform.any_op, !transform.any_param + } + + transform.named_sequence @match_mmt_8192x5120x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { + %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op + %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value + %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value + transform.iree.match.cast_compatible_type %lhs = tensor<8192x640xf16> : !transform.any_value + transform.iree.match.cast_compatible_type %rhs = tensor<5120x640xf16> : !transform.any_value + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 2, subgroup_n_count = 2> + , no_reorder_workgroups}> + > -> !transform.any_param + transform.yield %matmul, %config : !transform.any_op, !transform.any_param + } + + transform.named_sequence @match_mmt_8192x640x2560(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { + %mmt = 
transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op + %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value + %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value + transform.iree.match.cast_compatible_type %lhs = tensor<8192x2560xf16> : !transform.any_value + transform.iree.match.cast_compatible_type %rhs = tensor<640x2560xf16> : !transform.any_value + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 2, subgroup_n_count = 2> + , llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + > -> !transform.any_param + transform.yield %matmul, %config : !transform.any_op, !transform.any_param + } + + transform.named_sequence @match_mmt_8192x640x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { + %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op + %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value + %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value + transform.iree.match.cast_compatible_type %lhs = tensor<8192x640xf16> : !transform.any_value + transform.iree.match.cast_compatible_type %rhs = tensor<640x640xf16> : !transform.any_value + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 4, subgroup_n_count = 1> + , no_reorder_workgroups, llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + > -> !transform.any_param + transform.yield %matmul, %config : !transform.any_op, !transform.any_param + } + +//===----------------------------------------------------------------------===// +// Contraction tuning +//===----------------------------------------------------------------------===// + + transform.named_sequence @match_contract_3x2x20x1024x64x1280(%contract: !transform.any_op {transform.readonly}) + -> (!transform.any_op, !transform.any_param) { + %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %contract { + ^bb0(%lhs: tensor<2x1024x1280xf16>, %rhs: tensor<3x20x64x1280xf16>, %out: tensor<3x2x20x1024x64xf32>): + %20 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, + affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>, + affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction"] + } ins(%lhs, %rhs : tensor<2x1024x1280xf16>, tensor<3x20x64x1280xf16>) + outs(%out : tensor<3x2x20x1024x64xf32>) { + ^bb0(%in: f16, %in_0: f16, %acc: f32): + %22 = arith.extf %in : f16 to f32 + %23 = arith.extf %in_0 : f16 to f32 + %24 = arith.mulf %22, %23 : f32 + %25 = arith.addf %acc, %24 : f32 + linalg.yield %25 : f32 + } -> tensor<3x2x20x1024x64xf32> + } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 4, subgroup_n_count = 1> + , llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + > -> !transform.any_param + transform.yield %contract, %config : !transform.any_op, 
!transform.any_param + } + + transform.named_sequence @match_contract_2x20x64x64x2048(%contract: !transform.any_op {transform.readonly}) + -> (!transform.any_op, !transform.any_param) { + %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %contract { + ^bb0(%lhs: tensor<2x64x2048xf16>, %rhs: tensor<20x64x2048xf16>, %out: tensor<2x20x64x64xf32>): + %14 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"] + } ins(%lhs, %rhs : tensor<2x64x2048xf16>, tensor<20x64x2048xf16>) + outs(%out : tensor<2x20x64x64xf32>) { + ^bb0(%in: f16, %in_0: f16, %acc: f32): + %16 = arith.extf %in : f16 to f32 + %17 = arith.extf %in_0 : f16 to f32 + %18 = arith.mulf %16, %17 : f32 + %19 = arith.addf %acc, %18 : f32 + linalg.yield %19 : f32 + } -> tensor<2x20x64x64xf32> + } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = #iree_codegen.translation_info, + subgroup_m_count = 2, subgroup_n_count = 2> + }> + > -> !transform.any_param + transform.yield %contract, %config : !transform.any_op, !transform.any_param + } + //===----------------------------------------------------------------------===// // Entry point //===----------------------------------------------------------------------===// @@ -469,6 +694,18 @@ module attributes { transform.with_named_sequence } { // Attention. @match_attention_len_512 -> @custom_attention_len_512, @match_attention -> @custom_attention + + // Matmul. + , @match_mmt_2048x10240x1280 -> @apply_op_config + , @match_mmt_2048x1280x5120 -> @apply_op_config + , @match_mmt_2048x1280x1280 -> @apply_op_config + , @match_mmt_8192x5120x640 -> @apply_op_config + , @match_mmt_8192x640x2560 -> @apply_op_config + , @match_mmt_8192x640x640 -> @apply_op_config + + // Contration. + , @match_contract_3x2x20x1024x64x1280 -> @apply_op_config + , @match_contract_2x20x64x64x2048 -> @apply_op_config : (!transform.any_op) -> (!transform.any_op) transform.yield } diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index ba0ce3f25..a889e93c6 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -133,7 +133,7 @@ def compile_to_vmfb( os.path.realpath(os.path.dirname(__file__)), "default_mfma_attn_spec.mlir", ) - flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) + flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) print("Compiling to", device, "with flags:", flags) From 8309b6853cb996668a6c360acadd0d7cb5fac5d6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 22 May 2024 15:10:20 -0500 Subject: [PATCH 064/174] Small fixes. 
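
Adds a combined PromptEncoderModule to sd_inference/clip.py that wraps both SDXL
text encoders (CLIPTextModel and CLIPTextModelWithProjection) and produces prompt
and negative-prompt embeddings in a single forward pass, and points
export_clip_model at it. Also defaults the pipeline's ireec_flags to a
per-submodel dict and only selects the fp16-fix VAE for SDXL models.

Rough usage sketch for the new module (illustrative only, not part of this patch;
the tokenizer subfolders, max_length=64, and fp16 precision are assumptions):

    from transformers import CLIPTokenizer
    from turbine_models.custom_models.sd_inference.clip import PromptEncoderModule

    hf_model_name = "stabilityai/stable-diffusion-xl-base-1.0"
    tok_1 = CLIPTokenizer.from_pretrained(hf_model_name, subfolder="tokenizer")
    tok_2 = CLIPTokenizer.from_pretrained(hf_model_name, subfolder="tokenizer_2")
    encoder = PromptEncoderModule(hf_model_name, precision="fp16")

    def ids(tok, text):
        # Pad/truncate to the sequence length the encoder call expects.
        return tok(
            text,
            padding="max_length",
            max_length=64,
            truncation=True,
            return_tensors="pt",
        ).input_ids

    prompt_embeds, add_text_embeds = encoder(
        ids(tok_1, "a photo of a cat"),
        ids(tok_2, "a photo of a cat"),
        ids(tok_1, ""),
        ids(tok_2, ""),
    )
    # prompt_embeds concatenates the penultimate hidden states of both encoders
    # (negative + positive along the batch dim when CFG is enabled);
    # add_text_embeds is the pooled output of text_encoder_2.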
--- .../custom_models/sd_inference/clip.py | 82 ++++++++++++++++++- .../custom_models/sd_inference/sd_pipeline.py | 5 +- 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index e3e23661e..eb4d496fa 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -44,6 +44,86 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") +class PromptEncoderModule(torch.nn.Module): + def __init__( + self, + hf_model_name, + precision, + hf_auth_token=None, + do_classifier_free_guidance=True, + ): + super().__init__() + self.torch_dtype = torch.float16 if precision == "fp16" else torch.float32 + self.text_encoder_model_1 = CLIPTextModel.from_pretrained( + hf_model_name, + subfolder="text_encoder", + token=hf_auth_token, + ) + self.text_encoder_model_2 = CLIPTextModelWithProjection.from_pretrained( + hf_model_name, + subfolder="text_encoder_2", + token=hf_auth_token, + ) + self.do_classifier_free_guidance = do_classifier_free_guidance + + def forward( + self, text_input_ids_1, text_input_ids_2, uncond_input_ids_1, uncond_input_ids_2 + ): + with torch.no_grad(): + prompt_embeds_1 = self.text_encoder_model_1( + text_input_ids_1, + output_hidden_states=True, + ) + prompt_embeds_2 = self.text_encoder_model_2( + text_input_ids_2, + output_hidden_states=True, + ) + neg_prompt_embeds_1 = self.text_encoder_model_1( + uncond_input_ids_1, + output_hidden_states=True, + ) + neg_prompt_embeds_2 = self.text_encoder_model_2( + uncond_input_ids_2, + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds_2[0] + neg_pooled_prompt_embeds = neg_prompt_embeds_2[0] + + prompt_embeds_list = [ + prompt_embeds_1.hidden_states[-2], + prompt_embeds_2.hidden_states[-2], + ] + neg_prompt_embeds_list = [ + neg_prompt_embeds_1.hidden_states[-2], + neg_prompt_embeds_2.hidden_states[-2], + ] + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + neg_prompt_embeds = torch.concat(neg_prompt_embeds_list, dim=-1) + + bs_embed, seq_len, _ = prompt_embeds.shape + + prompt_embeds = prompt_embeds.repeat(1, 1, 1) + prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1) + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view( + bs_embed * 1, -1 + ) + add_text_embeds = pooled_prompt_embeds + if self.do_classifier_free_guidance: + neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat(1, 1).view( + 1, -1 + ) + neg_prompt_embeds = neg_prompt_embeds.repeat(1, 1, 1) + neg_prompt_embeds = neg_prompt_embeds.view(bs_embed * 1, seq_len, -1) + prompt_embeds = torch.cat([neg_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat( + [neg_pooled_prompt_embeds, add_text_embeds], dim=0 + ) + + add_text_embeds = add_text_embeds.to(self.torch_dtype) + prompt_embeds = prompt_embeds.to(self.torch_dtype) + return prompt_embeds, add_text_embeds def export_clip_model( hf_model_name, @@ -79,7 +159,7 @@ def export_clip_model( ) hf_subfolder = "text_encoder" - text_encoder_model = CLIPTextModel.from_pretrained( + text_encoder_model = PromptEncoderModule( hf_model_name, subfolder=hf_subfolder, token=hf_auth_token, diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 93e55a9fa..34bc75886 100644 --- 
a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -24,6 +24,7 @@ import time from datetime import datetime as dt + device_list = [ "cpu", "vulkan", @@ -60,7 +61,7 @@ def __init__( num_inference_steps: int, device: str, iree_target_triple: str, - ireec_flags: dict = EMPTY_FLAGS, + ireec_flags: dict = copy.deepcopy(SUBMODELS), attn_spec: str = None, decomp_attn: bool = False, pipeline_dir: str = "./shark_vmfbs", @@ -183,7 +184,7 @@ def get_torch_models(self, submodel): ) return unet_torch case "vae_decode": - if not self.custom_vae: + if not self.custom_vae and self.is_sdxl: custom_vae = "madebyollin/sdxl-vae-fp16-fix" if self.precision == "fp16" and self.is_sdxl else None vae_torch = vae.VaeModel( self.hf_model_name, From 7646f42722e0d2f8903f95aed43a4b3efdd0725c Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 00:29:58 -0500 Subject: [PATCH 065/174] Shore up SD1.5/2.1 implementations to match SDXL pipeline API / structure --- .../custom_models/sd_inference/clip.py | 179 +++----- .../custom_models/sd_inference/schedulers.py | 64 ++- .../custom_models/sd_inference/sd_cmd_opts.py | 8 +- .../custom_models/sd_inference/sd_pipeline.py | 145 +++---- .../sd_inference/tokenization.py | 404 ++++++++++++++++++ .../custom_models/sd_inference/unet.py | 156 +++---- .../custom_models/sd_inference/utils.py | 16 +- .../custom_models/sd_inference/vae.py | 136 +++--- .../custom_models/sdxl_inference/clip.py | 9 +- .../sdxl_inference/sdxl_prompt_encoder.py | 1 - 10 files changed, 764 insertions(+), 354 deletions(-) create mode 100644 models/turbine_models/custom_models/sd_inference/tokenization.py diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index eb4d496fa..523f8a986 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -44,99 +44,45 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -class PromptEncoderModule(torch.nn.Module): - def __init__( - self, - hf_model_name, - precision, - hf_auth_token=None, - do_classifier_free_guidance=True, - ): - super().__init__() - self.torch_dtype = torch.float16 if precision == "fp16" else torch.float32 - self.text_encoder_model_1 = CLIPTextModel.from_pretrained( - hf_model_name, - subfolder="text_encoder", - token=hf_auth_token, - ) - self.text_encoder_model_2 = CLIPTextModelWithProjection.from_pretrained( - hf_model_name, - subfolder="text_encoder_2", - token=hf_auth_token, - ) - self.do_classifier_free_guidance = do_classifier_free_guidance - - def forward( - self, text_input_ids_1, text_input_ids_2, uncond_input_ids_1, uncond_input_ids_2 - ): - with torch.no_grad(): - prompt_embeds_1 = self.text_encoder_model_1( - text_input_ids_1, - output_hidden_states=True, - ) - prompt_embeds_2 = self.text_encoder_model_2( - text_input_ids_2, - output_hidden_states=True, - ) - neg_prompt_embeds_1 = self.text_encoder_model_1( - uncond_input_ids_1, - output_hidden_states=True, - ) - neg_prompt_embeds_2 = self.text_encoder_model_2( - uncond_input_ids_2, - output_hidden_states=True, - ) - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = prompt_embeds_2[0] - neg_pooled_prompt_embeds = neg_prompt_embeds_2[0] - - prompt_embeds_list = [ - prompt_embeds_1.hidden_states[-2], - prompt_embeds_2.hidden_states[-2], - ] - neg_prompt_embeds_list 
= [ - neg_prompt_embeds_1.hidden_states[-2], - neg_prompt_embeds_2.hidden_states[-2], - ] - - prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) - neg_prompt_embeds = torch.concat(neg_prompt_embeds_list, dim=-1) - - bs_embed, seq_len, _ = prompt_embeds.shape - - prompt_embeds = prompt_embeds.repeat(1, 1, 1) - prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1) - pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view( - bs_embed * 1, -1 - ) - add_text_embeds = pooled_prompt_embeds - if self.do_classifier_free_guidance: - neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat(1, 1).view( - 1, -1 - ) - neg_prompt_embeds = neg_prompt_embeds.repeat(1, 1, 1) - neg_prompt_embeds = neg_prompt_embeds.view(bs_embed * 1, seq_len, -1) - prompt_embeds = torch.cat([neg_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat( - [neg_pooled_prompt_embeds, add_text_embeds], dim=0 - ) - - add_text_embeds = add_text_embeds.to(self.torch_dtype) - prompt_embeds = prompt_embeds.to(self.torch_dtype) - return prompt_embeds, add_text_embeds -def export_clip_model( +def export_clip( hf_model_name, - hf_auth_token=None, - compile_to="torch", - external_weights=None, - external_weight_path=None, - device=None, - target_triple=None, - max_alloc=None, - upload_ir=False, + hf_auth_token: str = None, + max_length: int = 64, + precision: str = "fp16", + compile_to: str = "torch", + external_weights: str = None, + external_weight_path: str = None, + device: str = "llvm-cpu", + target_triple: str = "x86_64-linux-gnu", + ireec_flags: str = None, + exit_on_vmfb: bool = False, + pipeline_dir: str = None, + input_mlir: str = None, + td_spec: str = None, + weights_only: bool = False, + upload_ir: bool = False, ): - input_len = 77 + input_len = max_length + if pipeline_dir not in [None, ""]: + safe_name = os.path.join(pipeline_dir, "clip") + else: + safe_name = utils.create_safe_name( + hf_model_name, f"_{str(max_length)}-{precision}-clip-{device}" + ) + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + const_expr_hoisting=True, + attn_spec=td_spec, + ) + return vmfb_path if "google/t5" in hf_model_name: from transformers import T5Tokenizer, T5Model @@ -159,7 +105,7 @@ def export_clip_model( ) hf_subfolder = "text_encoder" - text_encoder_model = PromptEncoderModule( + text_encoder_model = CLIPTextModel.from_pretrained( hf_model_name, subfolder=hf_subfolder, token=hf_auth_token, @@ -170,6 +116,9 @@ def export_clip_model( mapper, text_encoder_model, external_weights, external_weight_path ) + if weights_only: + return external_weight_path + if "google/t5" in hf_model_name: class CompiledClip(CompiledModule): @@ -212,38 +161,46 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): inst = CompiledClip(context=Context(), import_to=import_to) module_str = str(CompiledModule.get_mlir_module(inst)) - safe_name = utils.create_safe_name(hf_model_name, "-clip") - if upload_ir: - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "-clip" - blob_name = turbine_tank.uploadToBlobStorage( - str(os.path.abspath(f"{safe_name}.mlir")), - f"{model_name_upload}/{model_name_upload}.mlir", - ) if compile_to != "vmfb": return module_str, tokenizer else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) - if upload_ir: - return blob_name - + vmfb_path = 
utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=not exit_on_vmfb, + const_expr_hoisting=True, + attn_spec=td_spec, + ) + return None, vmfb_path if __name__ == "__main__": - args = parser.parse_args() - mod_str, _ = export_clip_model( + from .sd_cmd_opts import args + + mod_str, _ = export_clip( args.hf_model_name, args.hf_auth_token, + args.batch_size, + args.max_length, + args.precision, args.compile_to, args.external_weights, args.external_weight_path, args.device, args.iree_target_triple, - args.vulkan_max_allocation, + args.ireec_flags + args.clip_flags, + exit_on_vmfb=True, + pipeline_dir=args.pipeline_dir, + input_mlir=args.input_mlir, + attn_spec=args.attn_spec, + ) + if args.input_mlir: + exit() + safe_name = utils.create_safe_name( + args.hf_model_name, f"{str(args.max_length)}_{args.precision}_clip" ) - safe_name = args.hf_model_name.split("/")[-1].strip() - safe_name = re.sub("-", "_", safe_name) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) - print("Saved to", safe_name + ".mlir") + print("Saved to", safe_name + ".mlir") \ No newline at end of file diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 7990ae6ed..25f0950fd 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -10,10 +10,28 @@ import torch from shark_turbine.aot import * from iree.compiler.ir import Context +import iree.runtime as ireert import numpy as np from turbine_models.turbine_tank import turbine_tank from turbine_models.custom_models.sd_inference import utils +from turbine_models.model_runner import vmfbRunner + +class SharkSchedulerWrapper(): + def __init__(self, rt_device, vmfb, weights): + self.runner = vmfbRunner( + rt_device, vmfb, weights + ) + + def initialize(self, sample): + return self.runner.ctx.modules.scheduler["initialize"](sample) + + def scale_model_input(self, sample, t): + return self.runner.ctx.modules.scheduler["scale_model_input"](sample, t) + + def step(self, sample, latents, t): + return self.runner.ctx.modules.scheduler["step"](sample, latents, t) + class SchedulingModel(torch.nn.Module): def __init__(self, scheduler, height, width): @@ -31,11 +49,34 @@ def initialize(self, sample): add_time_ids = torch.tensor([add_time_ids]) add_time_ids = torch.cat([add_time_ids] * 2, dim=0) add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) - timesteps = self.scheduler.timesteps + timesteps = self.model.timesteps step_indexes = torch.tensor(len(timesteps)) - sample = sample * self.scheduler.init_noise_sigma + sample = sample * self.model.init_noise_sigma return sample.type(self.dtype), add_time_ids, step_indexes - + + def scale_model_input(self, sample, t): + self.model.scale_model_input(sample, t) + + def step(self, sample, latents, t): + self.model.step(self, sample, latents, t) + +class SharkSchedulerCPUWrapper(SchedulingModel): + def __init__(self, pipe, scheduler, height, width): + super().__init__(scheduler, height, width) + self.dest = pipe.runner["unet"].config.device + self.dtype = pipe.iree_dtype + + def initialize(self, sample): + for output in super().initialize(sample): + iree_arrays = ireert.asdevicearray(self.dest, output, self.dtype) + + return iree_arrays + + def scale_model_input(self, sample, t): + return ireert.asdevicearray(self.dest, super.scale_model_input(sample, t), self.dtype) + + def step(self, 
sample, latents, t): + return ireert.asdevicearray(self.dest, super.step(sample.to_host(), latents.to_host(), t.to_host()), self.dtype) def export_scheduler( hf_model_name: str, @@ -106,11 +147,26 @@ def export_scheduler( class CompiledScheduler(CompiledModule): params = export_parameters(scheduled_unet_model) - def run_initialize( + def initialize( self, sample=AbstractTensor(*sample, dtype=dtype), ): return jittable(scheduler_module.initialize)(sample) + + def scale_model_input( + self, + sample=AbstractTensor(*sample, dtype=dtype), + t=AbstractTensor(1, dtype=dtype), + ): + return jittable(scheduler_module.scale_model_input)(sample, t) + + def step( + self, + sample=AbstractTensor(*sample, dtype=dtype), + latents=AbstractTensor(1, dtype=dtype), + t=AbstractTensor(1, dtype=dtype), + ): + return jittable(scheduler_module.step)(sample, latents, t) import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledScheduler(context=Context(), import_to=import_to) diff --git a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py index 9b1a41767..e56737369 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py +++ b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py @@ -20,7 +20,7 @@ def is_valid_file(arg): # We should consider separating out the options that are "model configs" from # the options that control the compiler, runtime, and script behavior, -# when applicable, as the formermost would best be kept in a separate +# when applicable, as the former would best be kept in a separate # config or imported from huggingface. p = argparse.ArgumentParser( @@ -41,13 +41,13 @@ def is_valid_file(arg): "--hf_model_name", type=str, help="HF model name", - default="stabilityai/stable-diffusion-xl-base-1.0", + default="stabilityai/stable-diffusion-2-1", ) p.add_argument( "--scheduler_id", type=str, help="Scheduler ID", - default="PNDM", + default="Euler", ) ############################################################################## @@ -286,4 +286,4 @@ def is_valid_file(arg): ) -args, unknown_args = p.parse_known_args() +args, unknown = p.parse_known_args() diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 34bc75886..f47dda536 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -15,8 +15,10 @@ schedulers, utils, ) +from .tokenization import get_weighted_text_embeddings from turbine_models.model_runner import vmfbRunner from transformers import CLIPTokenizer +from pathlib import Path from PIL import Image import os @@ -64,17 +66,20 @@ def __init__( ireec_flags: dict = copy.deepcopy(SUBMODELS), attn_spec: str = None, decomp_attn: bool = False, - pipeline_dir: str = "./shark_vmfbs", - external_weights_dir: str = "./shark_weights", + pipeline_dir: str | Path = "./shark_vmfbs", + external_weights_dir: str | Path = "./shark_weights", external_weights: str = "safetensors", + custom_vae: str = None, vae_decomp_attn: bool = True, ): self.hf_model_name = hf_model_name + self.cpu_scheduling = True self.scheduler_id = scheduler_id self.height = height self.width = width self.precision = precision self.max_length = max_length + self.model_max_length = max_length self.batch_size = batch_size self.num_inference_steps = num_inference_steps self.device = device @@ -85,6 +90,7 @@ def __init__( 
self.pipeline_dir = pipeline_dir self.external_weights_dir = external_weights_dir self.external_weights = external_weights + self.custom_vae = custom_vae self.vae_decomp_attn = vae_decomp_attn self.is_sdxl = "xl" in self.hf_model_name @@ -176,19 +182,12 @@ def get_torch_models(self, submodel): case "unet": unet_torch = unet.UnetModel( self.hf_model_name, - self.height, - self.width, - self.batch_size, - None, - precision=self.precision, ) return unet_torch case "vae_decode": - if not self.custom_vae and self.is_sdxl: - custom_vae = "madebyollin/sdxl-vae-fp16-fix" if self.precision == "fp16" and self.is_sdxl else None vae_torch = vae.VaeModel( self.hf_model_name, - custom_vae, + self.custom_vae, ) return vae_torch @@ -239,7 +238,7 @@ def export_submodel( input_mlir = copy.deepcopy(SUBMODELS) match submodel: case "clip": - _, clip_vmfb = clip.export_combined_clip( + _, clip_vmfb = clip.export_clip( self.hf_model_name, None, self.max_length, @@ -253,12 +252,14 @@ def export_submodel( exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, input_mlir=input_mlir["clip"], - attn_spec=self.attn_spec, + td_spec=self.attn_spec, weights_only=weights_only, ) return clip_vmfb, clip_external_weight_path case "scheduler": - scheduler_vmfb = schedulers.export_scheduler( + if self.cpu_scheduling: + return utils.get_scheduler(self.hf_model_name, self.scheduler_id), None + scheduler = schedulers.export_scheduler( self.hf_model_name, self.scheduler_id, self.batch_size, @@ -274,7 +275,7 @@ def export_submodel( pipeline_dir=self.pipeline_dir, input_mlir=input_mlir["scheduler"], ) - return scheduler_vmfb, None + return scheduler, None case "unet": if input_mlir[submodel]: unet_torch = None @@ -358,9 +359,11 @@ def load_pipeline( runners["clip"] = vmfbRunner( rt_device, vmfbs["clip"], weights["clip"] ) - runners["scheduler"] = vmfbRunner( - rt_device, vmfbs["scheduler"], weights["scheduler"] - ) + if isinstance(vmfbs["scheduler"], torch.nn.Module): + self.scheduler = schedulers.SchedulingModel(vmfbs['scheduler'], self.height, self.width) + else: + self.scheduler = schedulers.SharkSchedulerWrapper(rt_device, vmfbs["scheduler"], weights["scheduler"]) + runners["unet"] = vmfbRunner( rt_device, vmfbs["unet"], weights["unet"] ) @@ -385,17 +388,13 @@ def generate_images( # TODO: implement case where this is false e.g. in SDXL Turbo # do_classifier_free_guidance = True - iree_dtype = "float32" if self.precision == "fp32" else "float16" + self.iree_dtype = "float32" if self.precision == "fp32" else "float16" torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 pipe_start = time.time() - - max_length = self.max_length - samples = [] numpy_images = [] - for i in range(batch_count): generator = torch.random.manual_seed(seed + i) rand_sample = torch.randn( @@ -410,82 +409,74 @@ def generate_images( ) samples.append( ireert.asdevicearray( - self.runners["unet"].config.device, rand_sample, dtype=iree_dtype + self.runners["unet"].config.device, rand_sample, dtype=self.iree_dtype ) ) guidance_scale = ireert.asdevicearray( self.runners["unet"].config.device, np.asarray([guidance_scale]), - dtype=iree_dtype, + dtype=self.iree_dtype, ) - text_input_ids_list = [] - uncond_input_ids_list = [] - tokenize_start = time.time() # Tokenize prompt and negative prompt. 
- for tokenizer in self.runners["tokenizers"]: - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - uncond_input = tokenizer( - negative_prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - uncond_input_ids = uncond_input.input_ids - - text_input_ids_list.extend( - [ - ireert.asdevicearray( - self.runners["unet"].config.device, text_input_ids - ) - ] - ) - uncond_input_ids_list.extend( - [ - ireert.asdevicearray( - self.runners["unet"].config.device, uncond_input_ids - ) - ] - ) - encode_prompts_start = time.time() - prompt_embeds, add_text_embeds = self.runners[ - "clip" - ].ctx.modules.compiled_clip["encode_prompts"]( - *text_input_ids_list, *uncond_input_ids_list - ) + prompt_embeds, negative_embeds = get_weighted_text_embeddings(self, prompt, negative_prompt) encode_prompts_end = time.time() for i in range(batch_count): unet_start = time.time() + + sample, add_time_ids, timesteps = self.scheduler.initialize(samples[i]) - sample, add_time_ids, timesteps = self.runners["scheduler"].ctx.modules.scheduler[ - "init" - ](samples[i]) + if self.is_img2img: + init_timestep = min( + int(num_inference_steps * strength), num_inference_steps + ) + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + latents = self.encode_image(image) + latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1)) + return latents, [timesteps] + + if self.cpu_scheduling: + sample = ireert.asdevicearray( + self.runners["unet"].config.device, + np.asarray(sample), + dtype=self.iree_dtype + ) + add_time_ids = ireert.asdevicearray( + self.runners["unet"].config.device, + np.asarray(add_time_ids), + dtype=self.iree_dtype + ) + timesteps = ireert.asdevicearray( + self.runners["unet"].config.device, + np.asarray(timesteps), + dtype=self.iree_dtype + ) for t in range(timesteps): - latents = self.runners["scheduler"].ctx.modules.scheduler["scale"]( + latents = self.scheduler.scale_model_input( sample, t ) latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( - latents, prompt_embeds, add_text_embeds, add_time_ids, guidance_scale, t + latents, prompt_embeds, negative_embeds, add_time_ids, guidance_scale, t ) - sample = self.runners["scheduler"].ctx.modules.scheduler["step"]( + sample = self.scheduler.step( sample, latents, t ) + if self.cpu_scheduling: + sample = ireert.asdevicearray( + self.runners["vae_decode"].config.device, + np.asarray(sample), + dtype=self.iree_dtype + ) + vae_start = time.time() vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( sample @@ -512,18 +503,18 @@ def generate_images( print("VAE time: ", pipe_end - vae_start, "sec") print( f"\nTotal time (txt2img, batch #{str(i+1)}): ", - (encode_prompts_end - encode_prompts_start) + (encode_prompts_end - tokenize_start) + (pipe_end - unet_start), "sec\n", ) end = time.time() - print("Total CLIP time:", encode_prompts_end - encode_prompts_start, "sec") - print("Total tokenize time:", encode_prompts_start - tokenize_start, "sec") - print("Loading time: ", encode_prompts_start - pipe_start, "sec") + print("Total CLIP time:", encode_prompts_end - tokenize_start, "sec") + print("Total tokenize time:", tokenize_start - tokenize_start, "sec") + print("Loading time: ", tokenize_start - pipe_start, "sec") if batch_count > 1: print( f"Total inference time ({batch_count} batch(es)):", - end - 
encode_prompts_start, + end - tokenize_start, "sec", ) timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S") @@ -558,7 +549,7 @@ def numpy_to_pil_image(images): if __name__ == "__main__": - from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args mlirs = copy.deepcopy(SUBMODELS) vmfbs = copy.deepcopy(SUBMODELS) diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py new file mode 100644 index 000000000..83bdcb881 --- /dev/null +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -0,0 +1,404 @@ +from typing import List, Optional, Union +from iree import runtime as ireert +import re +import torch +import numpy as np + +re_attention = re.compile( + r""" +\\\(| +\\\)| +\\\[| +\\]| +\\\\| +\\| +\(| +\[| +:([+-]?[.\d]+)\)| +\)| +]| +[^\\()\[\]:]+| +: +""", + re.X, +) + + +def parse_prompt_attention(text): + """ + Parses a string with attention tokens and returns a list of pairs: + text and its associated weight. + Accepted tokens are: + (abc) - increases attention to abc by a multiplier of 1.1 + (abc:3.12) - increases attention to abc by a multiplier of 3.12 + [abc] - decreases attention to abc by a multiplier of 1.1 + \( - literal character '(' + \[ - literal character '[' + \) - literal character ')' + \] - literal character ']' + \\ - literal character '\' + anything else - just text + >>> parse_prompt_attention('normal text') + [['normal text', 1.0]] + >>> parse_prompt_attention('an (important) word') + [['an ', 1.0], ['important', 1.1], [' word', 1.0]] + >>> parse_prompt_attention('(unbalanced') + [['unbalanced', 1.1]] + >>> parse_prompt_attention('\(literal\]') + [['(literal]', 1.0]] + >>> parse_prompt_attention('(unnecessary)(parens)') + [['unnecessaryparens', 1.1]] + >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') + [['a ', 1.0], + ['house', 1.5730000000000004], + [' ', 1.1], + ['on', 1.0], + [' a ', 1.1], + ['hill', 0.55], + [', sun, ', 1.1], + ['sky', 1.4641000000000006], + ['.', 1.1]] + """ + + res = [] + round_brackets = [] + square_brackets = [] + + round_bracket_multiplier = 1.1 + square_bracket_multiplier = 1 / 1.1 + + def multiply_range(start_position, multiplier): + for p in range(start_position, len(res)): + res[p][1] *= multiplier + + for m in re_attention.finditer(text): + text = m.group(0) + weight = m.group(1) + + if text.startswith("\\"): + res.append([text[1:], 1.0]) + elif text == "(": + round_brackets.append(len(res)) + elif text == "[": + square_brackets.append(len(res)) + elif weight is not None and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), float(weight)) + elif text == ")" and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), round_bracket_multiplier) + elif text == "]" and len(square_brackets) > 0: + multiply_range(square_brackets.pop(), square_bracket_multiplier) + else: + res.append([text, 1.0]) + + for pos in round_brackets: + multiply_range(pos, round_bracket_multiplier) + + for pos in square_brackets: + multiply_range(pos, square_bracket_multiplier) + + if len(res) == 0: + res = [["", 1.0]] + + # merge runs of identical weights + i = 0 + while i + 1 < len(res): + if res[i][1] == res[i + 1][1]: + res[i][0] += res[i + 1][0] + res.pop(i + 1) + else: + i += 1 + + return res + + +def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): + r""" + Tokenize a list of prompts and return its 
tokens with weights of each token. + No padding, starting or ending token is included. + """ + tokens = [] + weights = [] + truncated = False + for text in prompt: + texts_and_weights = parse_prompt_attention(text) + text_token = [] + text_weight = [] + for word, weight in texts_and_weights: + # tokenize and discard the starting and the ending token + token = pipe.tokenizer(word).input_ids[1:-1] + text_token += token + # copy the weight by length of token + text_weight += [weight] * len(token) + # stop if the text is too long (longer than truncation limit) + if len(text_token) > max_length: + truncated = True + break + # truncate + if len(text_token) > max_length: + truncated = True + text_token = text_token[:max_length] + text_weight = text_weight[:max_length] + tokens.append(text_token) + weights.append(text_weight) + if truncated: + print( + "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" + ) + return tokens, weights + + +def pad_tokens_and_weights( + tokens, + weights, + max_length, + bos, + eos, + no_boseos_middle=True, + chunk_length=77, +): + r""" + Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. + """ + max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) + weights_length = ( + max_length if no_boseos_middle else max_embeddings_multiples * chunk_length + ) + for i in range(len(tokens)): + tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i])) + if no_boseos_middle: + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) + else: + w = [] + if len(weights[i]) == 0: + w = [1.0] * weights_length + else: + for j in range(max_embeddings_multiples): + w.append(1.0) # weight for starting token in this chunk + w += weights[i][ + j + * (chunk_length - 2) : min( + len(weights[i]), (j + 1) * (chunk_length - 2) + ) + ] + w.append(1.0) # weight for ending token in this chunk + w += [1.0] * (weights_length - len(w)) + weights[i] = w[:] + + return tokens, weights + + +def get_unweighted_text_embeddings( + pipe, + text_input, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): + """ + When the length of tokens is a multiple of the capacity of the text encoder, + it should be split into chunks and sent to the text encoder individually. 
+ """ + max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) + if max_embeddings_multiples > 1: + text_embeddings = [] + for i in range(max_embeddings_multiples): + # extract the i-th chunk + text_input_chunk = text_input[ + :, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2 + ].clone() + + # cover the head and the tail by the starting and the ending tokens + text_input_chunk[:, 0] = text_input[0, 0] + text_input_chunk[:, -1] = text_input[0, -1] + + text_input_chunk = ireert.asdevicearray(pipe.runners["clip"].config.device, text_input_chunk, pipe.iree_dtype) + text_embedding = pipe.runners["clip"].ctx.modules.compiled_clip["encode_prompts"]( + text_input_chunk + ).to_host() + if no_boseos_middle: + if i == 0: + # discard the ending token + text_embedding = text_embedding[:, :-1] + elif i == max_embeddings_multiples - 1: + # discard the starting token + text_embedding = text_embedding[:, 1:] + else: + # discard both starting and ending tokens + text_embedding = text_embedding[:, 1:-1] + + text_embeddings.append(text_embedding) + # SHARK: Convert the result to tensor + # text_embeddings = torch.concat(text_embeddings, axis=1) + text_embeddings_np = np.concatenate(np.array(text_embeddings)) + text_embeddings = torch.from_numpy(text_embeddings_np) + else: + text_embeddings = pipe.run("clip", text_input)[0] + text_embeddings = torch.from_numpy(text_embeddings.to_host()) + return text_embeddings + + +# This function deals with NoneType values occuring in tokens after padding +# It switches out None with 49407 as truncating None values causes matrix dimension errors, +def filter_nonetype_tokens(tokens: List[List]): + return [[49407 if token is None else token for token in tokens[0]]] + +def get_tokenized_inputs( + pipe, + tokenizer, + prompt, + uncond_prompt, + max_length, + max_embeddings_multiples: Optional[int] = 8, + no_boseos_middle: Optional[bool] = True, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, +): + if not skip_parsing: + prompt_tokens, prompt_weights = get_prompts_with_weights( + pipe, prompt, max_length - 2 + ) + if uncond_prompt is not None: + uncond_tokens, uncond_weights = get_prompts_with_weights( + pipe, uncond_prompt, max_length - 2 + ) + else: + prompt_tokens = [ + token[1:-1] + for token in tokenizer( + prompt, max_length=max_length, truncation=True + ).input_ids + ] + prompt_weights = [[1.0] * len(token) for token in prompt_tokens] + if uncond_prompt is not None: + if isinstance(uncond_prompt, str): + uncond_prompt = [uncond_prompt] + uncond_tokens = [ + token[1:-1] + for token in tokenizer( + uncond_prompt, max_length=max_length, truncation=True + ).input_ids + ] + uncond_weights = [[1.0] * len(token) for token in uncond_tokens] + + # round up the longest length of tokens to a multiple of (model_max_length - 2) + max_length = max([len(token) for token in prompt_tokens]) + if uncond_prompt is not None: + max_length = max(max_length, max([len(token) for token in uncond_tokens])) + max_embeddings_multiples = min( + max_embeddings_multiples, + (max_length - 1) // (pipe.model_max_length - 2) + 1, + ) + max_embeddings_multiples = max(1, max_embeddings_multiples) + + max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 + + # pad the length of tokens and weights + bos = tokenizer.bos_token_id + eos = tokenizer.eos_token_id + prompt_tokens, prompt_weights = pad_tokens_and_weights( + prompt_tokens, + prompt_weights, + max_length, + bos, + eos, + no_boseos_middle=no_boseos_middle, + 
chunk_length=pipe.model_max_length, + ) + + # FIXME: This is a hacky fix caused by tokenizer padding with None values + prompt_tokens = filter_nonetype_tokens(prompt_tokens) + + # prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device) + prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device="cpu") + if uncond_prompt is not None: + uncond_tokens, uncond_weights = pad_tokens_and_weights( + uncond_tokens, + uncond_weights, + max_length, + bos, + eos, + no_boseos_middle=no_boseos_middle, + chunk_length=pipe.model_max_length, + ) + + # FIXME: This is a hacky fix caused by tokenizer padding with None values + uncond_tokens = filter_nonetype_tokens(uncond_tokens) + + # uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device) + uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device="cpu") + if uncond_prompt is not None: + return prompt_tokens, prompt_weights, uncond_tokens, uncond_weights + else: + return prompt_tokens, prompt_weights, None, None + +def get_weighted_text_embeddings( + pipe, + prompt: List[str], + uncond_prompt: List[str] = None, + max_embeddings_multiples: Optional[int] = 8, + no_boseos_middle: Optional[bool] = True, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, +): + max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 + for tokenizer in pipe.runners['tokenizers']: + prompt_tokens, prompt_weights, uncond_tokens, uncond_weights = get_tokenized_inputs( + pipe, + tokenizer, + prompt, + uncond_prompt, + max_length, + max_embeddings_multiples, + no_boseos_middle, + skip_parsing, + skip_weighting + ) + + # get the embeddings + text_embeddings = get_unweighted_text_embeddings( + pipe, + prompt_tokens, + pipe.model_max_length, + no_boseos_middle=no_boseos_middle, + ) + # prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device) + prompt_weights = torch.tensor(prompt_weights, dtype=torch.float, device="cpu") + if uncond_prompt is not None: + uncond_embeddings = get_unweighted_text_embeddings( + pipe, + uncond_tokens, + pipe.model_max_length, + no_boseos_middle=no_boseos_middle, + ) + # uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device) + uncond_weights = torch.tensor(uncond_weights, dtype=torch.float, device="cpu") + + # assign weights to the prompts and normalize in the sense of mean + # TODO: should we normalize by chunk or in a whole (current implementation)? 
+ if (not skip_parsing) and (not skip_weighting): + previous_mean = ( + text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype) + ) + text_embeddings *= prompt_weights.unsqueeze(-1) + current_mean = ( + text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype) + ) + text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1) + if uncond_prompt is not None: + previous_mean = ( + uncond_embeddings.float() + .mean(axis=[-2, -1]) + .to(uncond_embeddings.dtype) + ) + uncond_embeddings *= uncond_weights.unsqueeze(-1) + current_mean = ( + uncond_embeddings.float() + .mean(axis=[-2, -1]) + .to(uncond_embeddings.dtype) + ) + uncond_embeddings *= ( + (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1) + ) + + if uncond_prompt is not None: + return text_embeddings, uncond_embeddings + return text_embeddings, None \ No newline at end of file diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index d5ee63ae1..3d3137823 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -6,6 +6,7 @@ import os import sys +import copy from iree import runtime as ireert from iree.compiler.ir import Context @@ -23,50 +24,10 @@ import argparse from turbine_models.turbine_tank import turbine_tank -parser = argparse.ArgumentParser() -parser.add_argument( - "--hf_auth_token", type=str, help="The Hugging Face auth token, required" -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference" -) -parser.add_argument( - "--height", type=int, default=512, help="Height of Stable Diffusion" -) -parser.add_argument("--width", type=int, default=512, help="Width of Stable Diffusion") -parser.add_argument( - "--precision", type=str, default="fp16", help="Precision of Stable Diffusion" -) -parser.add_argument( - "--max_length", type=int, default=77, help="Sequence Length of Stable Diffusion" -) -parser.add_argument("--compile_to", type=str, help="torch, linalg, vmfb") -parser.add_argument("--external_weight_path", type=str, default="") -parser.add_argument( - "--external_weights", - type=str, - default=None, - help="saves ir/vmfb without global weights for size and readability, options [safetensors]", -) -parser.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm") -# TODO: Bring in detection for target triple -parser.add_argument( - "--iree_target_triple", - type=str, - default="", - help="Specify vulkan target triple or rocm/cuda target device.", -) -parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") class UnetModel(torch.nn.Module): - def __init__(self, hf_model_name, hf_auth_token=None): + def __init__(self, hf_model_name): super().__init__() self.unet = UNet2DConditionModel.from_pretrained( hf_model_name, @@ -84,7 +45,6 @@ def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): ) return noise_pred - def export_unet_model( unet_model, hf_model_name, @@ -99,13 +59,43 @@ def export_unet_model( external_weight_path=None, device=None, target_triple=None, - max_alloc=None, - upload_ir=False, - decomp_attn=True, + ireec_flags=None, + decomp_attn=False, + exit_on_vmfb=False, + pipeline_dir=None, + attn_spec=None, + input_mlir=None, + weights_only=False, ): + if "turbo" in hf_model_name: + 
do_classifier_free_guidance = False + else: + do_classifier_free_guidance = True + if pipeline_dir: + safe_name = os.path.join( + pipeline_dir, f"unet" + ) + else: + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}", + ) + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + attn_spec=attn_spec, + ) + return vmfb_path + mapper = {} - decomp_list = DEFAULT_DECOMPOSITIONS - if decomp_attn: + decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) + if decomp_attn == True: decomp_list.extend( [ torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, @@ -113,18 +103,30 @@ def export_unet_model( ] ) dtype = torch.float16 if precision == "fp16" else torch.float32 - unet_model = unet_model.to(dtype) + + if precision == "fp16": + unet_model = unet_model.half() + utils.save_external_weights( mapper, unet_model, external_weights, external_weight_path ) + + if weights_only: + return external_weight_path + + sample = ( + batch_size, + unet_model.unet.config.in_channels, + height // 8, + width // 8, + ) + encoder_hidden_states_sizes = ( unet_model.unet.config.layers_per_block, max_length, unet_model.unet.config.cross_attention_dim, ) - sample = (batch_size, unet_model.unet.config.in_channels, height // 8, width // 8) - class CompiledUnet(CompiledModule): if external_weights: params = export_parameters( @@ -150,30 +152,29 @@ def main( inst = CompiledUnet(context=Context(), import_to=import_to) module_str = str(CompiledModule.get_mlir_module(inst)) - safe_name = utils.create_safe_name(hf_model_name, "-unet") - if upload_ir: - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - model_name_upload = hf_model_name.replace("/", "-") - model_name_upload += "_unet" - blob_name = turbine_tank.uploadToBlobStorage( - str(os.path.abspath(f"{safe_name}.mlir")), - f"{model_name_upload}/{model_name_upload}.mlir", - ) + if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) - if upload_ir: - return blob_name + utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=False, + attn_spec=attn_spec, + ) if __name__ == "__main__": - args = parser.parse_args() - unet_model = UnetModel( - args.hf_model_name, - args.hf_auth_token, - ) + from .sd_cmd_opts import args + if args.input_mlir: + unet_model = None + else: + unet_model = UnetModel( + args.hf_model_name, + ) mod_str = export_unet_model( unet_model, args.hf_model_name, @@ -188,10 +189,17 @@ def main( args.external_weight_path, args.device, args.iree_target_triple, - args.vulkan_max_allocation, + args.ireec_flags + args.attn_flags + args.unet_flags, + args.decomp_attn, + attn_spec=args.attn_spec, + input_mlir=args.input_mlir, + ) + if args.input_mlir: + exit() + safe_name = utils.create_safe_name( + args.hf_model_name, + f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet", ) - if mod_str is not None: - safe_name = utils.create_safe_name(args.hf_model_name, "-unet") - with open(f"{safe_name}.mlir", "w+") as f: - f.write(mod_str) - print("Saved to", safe_name + ".mlir") + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py 
b/models/turbine_models/custom_models/sd_inference/utils.py index a889e93c6..35245100d 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -10,7 +10,7 @@ ) # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. -amdgpu_flags = { +MI_flags = { "all": [ "--iree-global-opt-propagate-transposes=true", "--iree-opt-outer-dim-concat=true", @@ -18,7 +18,8 @@ "--iree-llvmgpu-enable-prefetch=true", "--iree-opt-data-tiling=false", "--iree-codegen-gpu-native-math-precision=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-rocm-waves-per-eu=2", + "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [ @@ -30,6 +31,7 @@ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true" ], "vae": ["--iree-flow-enable-aggressive-fusion"], } @@ -114,14 +116,14 @@ def compile_to_vmfb( if flag not in [None, "", " "]: flags.append(flag) - if target_triple in ["gfx940", "gfx941", "gfx942", "gfx1100", "gfx90a"]: + if target_triple in ["gfx940", "gfx941", "gfx942", "gfx90a"]: if "unet" in safe_name: - flags.extend(amdgpu_flags["unet"]) + flags.extend(MI_flags["unet"]) elif any(x in safe_name for x in ["clip", "prompt_encoder"]): - flags.extend(amdgpu_flags["clip"]) + flags.extend(MI_flags["clip"]) elif "vae" in safe_name: - flags.extend(amdgpu_flags["vae"]) - flags.extend(amdgpu_flags["all"]) + flags.extend(MI_flags["vae"]) + flags.extend(MI_flags["all"]) # Currently, we need a transform dialect script to be applied to the compilation through IREE in certain cases. # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. 
diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 0916acda0..2286ebdf2 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -21,43 +21,6 @@ import argparse from turbine_models.turbine_tank import turbine_tank -parser = argparse.ArgumentParser() -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference" -) -parser.add_argument( - "--height", type=int, default=512, help="Height of Stable Diffusion" -) -parser.add_argument("--width", type=int, default=512, help="Width of Stable Diffusion") -parser.add_argument( - "--precision", type=str, default="fp32", help="Precision of Stable Diffusion" -) -parser.add_argument("--compile_to", type=str, help="torch, linalg, vmfb") -parser.add_argument("--external_weight_path", type=str, default="") -parser.add_argument( - "--external_weights", - type=str, - default=None, - help="saves ir/vmfb without global weights for size and readability, options [safetensors]", -) -parser.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm") -# TODO: Bring in detection for target triple -parser.add_argument( - "--iree_target_triple", - type=str, - default="", - help="Specify vulkan target triple or rocm/cuda target device.", -) -parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument("--variant", type=str, default="decode") - - class VaeModel(torch.nn.Module): def __init__( self, @@ -113,11 +76,33 @@ def export_vae_model( external_weight_path=None, device=None, target_triple=None, - max_alloc=None, + ireec_flags=None, variant="decode", - upload_ir=False, - decomp_attn=True, + decomp_attn=False, + exit_on_vmfb=False, + pipeline_dir=None, + attn_spec=None, + input_mlir=None, + weights_only=False, ): + if pipeline_dir: + safe_name = os.path.join(pipeline_dir, "vae_" + variant) + else: + safe_name = utils.create_safe_name( + hf_model_name, f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}" + ) + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + attn_spec=attn_spec, + ) + return vmfb_path mapper = {} decomp_list = DEFAULT_DECOMPOSITIONS if decomp_attn: @@ -132,7 +117,8 @@ def export_vae_model( utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path ) - + if weights_only: + return external_weight_path sample = (batch_size, 4, height // 8, width // 8) if variant == "encode": sample = (batch_size, 3, height, width) @@ -150,45 +136,55 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): inst = CompiledVae(context=Context(), import_to=import_to) module_str = str(CompiledModule.get_mlir_module(inst)) - safe_name = utils.create_safe_name(hf_model_name, "-vae") - if upload_ir: - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-vae-" + variant - blob_name = turbine_tank.uploadToBlobStorage( - str(os.path.abspath(f"{safe_name}.mlir")), - f"{model_name_upload}/{model_name_upload}.mlir", - ) if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb(module_str, device, target_triple, 
max_alloc, safe_name) - if upload_ir: - return blob_name + vmfb_path = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=not exit_on_vmfb, + attn_spec=attn_spec, + ) + return vmfb_path + if __name__ == "__main__": - args = parser.parse_args() - vae_model = VaeModel( - args.hf_model_name, - ) + from .sd_cmd_opts import args + if args.input_mlir: + vae_model = None + else: + vae_model = VaeModel( + args.hf_model_name, + custom_vae=custom_vae, + ) mod_str = export_vae_model( vae_model, args.hf_model_name, args.batch_size, - args.height, - args.width, - args.precision, - args.compile_to, - args.external_weights, - args.external_weight_path, - args.device, - args.iree_target_triple, - args.vulkan_max_allocation, - args.variant, + height=args.height, + width=args.width, + precision=args.precision, + compile_to=args.compile_to, + external_weights=args.external_weights, + external_weight_path=args.external_weight_path, + device=args.device, + target_triple=args.iree_target_triple, + ireec_flags=args.ireec_flags + args.attn_flags + args.vae_flags, + variant=args.vae_variant, + decomp_attn=args.decomp_attn, + attn_spec=args.attn_spec, + input_mlir=args.input_mlir, + ) + if args.input_mlir or (args.compile_to == "vmfb"): + exit() + safe_name = utils.create_safe_name( + args.hf_model_name, + f"_bs{str(args.batch_size)}_{args.height}x{args.width}_{args.precision}_vae_{args.vae_variant}", ) - safe_name = utils.create_safe_name(args.hf_model_name, "-vae") with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) - print("Saved to", safe_name + ".mlir") + print("Saved to", safe_name + ".mlir") \ No newline at end of file diff --git a/models/turbine_models/custom_models/sdxl_inference/clip.py b/models/turbine_models/custom_models/sdxl_inference/clip.py index c5e583f86..2740745ed 100644 --- a/models/turbine_models/custom_models/sdxl_inference/clip.py +++ b/models/turbine_models/custom_models/sdxl_inference/clip.py @@ -48,7 +48,6 @@ def forward(self, input): def export_clip_model( hf_model_name, hf_auth_token=None, - batch_size=1, max_length=77, precision="fp16", compile_to="torch", @@ -68,7 +67,7 @@ def export_clip_model( safe_name = os.path.join(pipeline_dir, "clip_" + str(index)) else: safe_name = utils.create_safe_name( - hf_model_name, f"_bs{str(batch_size)}-{str(max_length)}-{precision}-clip-{index}-{device}" + hf_model_name, f"_{str(max_length)}-{precision}-clip-{index}-{device}" ) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -161,7 +160,6 @@ def main(self, inp=AbstractTensor(1, max_length, dtype=torch.int64)): mod_1_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, - args.batch_size, args.max_length, args.precision, args.compile_to, @@ -179,7 +177,6 @@ def main(self, inp=AbstractTensor(1, max_length, dtype=torch.int64)): mod_2_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, - args.batch_size, args.max_length, args.precision, args.compile_to, @@ -197,10 +194,10 @@ def main(self, inp=AbstractTensor(1, max_length, dtype=torch.int64)): if args.input_mlir: exit() safe_name_1 = safe_name = utils.create_safe_name( - args.hf_model_name, f"_bs{str(args.batch_size)}_{str(args.max_length)}_{args.precision}_clip_1" + args.hf_model_name, f"_{str(args.max_length)}_{args.precision}_clip_1" ) safe_name_2 = safe_name = utils.create_safe_name( - args.hf_model_name, f"_bs{str(args.batch_size)}_{str(args.max_length)}_{args.precision}_clip_2" + args.hf_model_name, f"_{str(args.max_length)}_{args.precision}_clip_2" ) 
with open(f"{safe_name_1}.mlir", "w+") as f: f.write(mod_1_str) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 6a4e6be43..fcd98be67 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -216,7 +216,6 @@ def encode_prompts( mod_str, _ = export_prompt_encoder( args.hf_model_name, args.hf_auth_token, - args.batch_size, args.max_length, args.precision, args.compile_to, From 7e80dd7199b5cd6ea32a7488790591f09aaf2692 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 01:50:41 -0500 Subject: [PATCH 066/174] Updates to cpu scheduling fallback. --- .../custom_models/sd_inference/schedulers.py | 65 ++++++++++++++++++- .../custom_models/sd_inference/sd_pipeline.py | 16 +++-- 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 25f0950fd..cdafbcbe6 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -13,6 +13,22 @@ import iree.runtime as ireert import numpy as np +from diffusers import ( + LCMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + DDPMScheduler, + DDIMScheduler, + DPMSolverMultistepScheduler, + KDPM2DiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DEISMultistepScheduler, + DPMSolverSinglestepScheduler, + KDPM2AncestralDiscreteScheduler, + HeunDiscreteScheduler, +) + from turbine_models.turbine_tank import turbine_tank from turbine_models.custom_models.sd_inference import utils from turbine_models.model_runner import vmfbRunner @@ -78,7 +94,7 @@ def scale_model_input(self, sample, t): def step(self, sample, latents, t): return ireert.asdevicearray(self.dest, super.step(sample.to_host(), latents.to_host(), t.to_host()), self.dtype) -def export_scheduler( +def export_scheduler_model( hf_model_name: str, scheduler_id: str, batch_size: int = 1, @@ -188,6 +204,53 @@ def step( exit() return vmfb +# from shark_turbine.turbine_models.schedulers import export_scheduler_model + + + +def get_scheduler(model_id, scheduler_id): + # TODO: switch over to turbine and run all on GPU + print(f"\n[LOG] Initializing schedulers from model id: {model_id}") + schedulers = {} + for sched in SCHEDULER_MAP: + schedulers[sched] = SCHEDULER_MAP[sched].from_pretrained(model_id, subfolder="scheduler") + schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained( + model_id, subfolder="scheduler", algorithm_type="dpmsolver" + ) + schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained( + model_id, subfolder="scheduler", algorithm_type="dpmsolver++" + ) + schedulers["DPMSolverMultistepKarras"] = ( + DPMSolverMultistepScheduler.from_pretrained( + model_id, + subfolder="scheduler", + use_karras_sigmas=True, + ) + ) + schedulers["DPMSolverMultistepKarras++"] = ( + DPMSolverMultistepScheduler.from_pretrained( + model_id, + subfolder="scheduler", + algorithm_type="dpmsolver++", + use_karras_sigmas=True, + ) + ) + return schedulers[scheduler_id] + +SCHEDULER_MAP = { + "PNDM": PNDMScheduler, + "DDPM": DDPMScheduler, + "KDPM2Discrete": KDPM2DiscreteScheduler, + "LMSDiscrete": LMSDiscreteScheduler, + "DDIM": DDIMScheduler, + "LCMScheduler": LCMScheduler, + 
"EulerDiscrete": EulerDiscreteScheduler, + "EulerAncestralDiscrete": EulerAncestralDiscreteScheduler, + "DEISMultistep": DEISMultistepScheduler, + "DPMSolverSinglestep": DPMSolverSinglestepScheduler, + "KDPM2AncestralDiscrete": KDPM2AncestralDiscreteScheduler, + "HeunDiscrete": HeunDiscreteScheduler, +} if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index f47dda536..77bb33a24 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -137,7 +137,9 @@ def check_prepared( def is_prepared(self, vmfbs, weights): missing = [] for key in vmfbs: - default_filepath = os.path.join(self.pipeline_dir, key + "_" + self.iree_target_triple + ".vmfb") + if "scheduler" in key and self.cpu_scheduling: + continue + default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") if vmfbs[key] is not None and os.path.exists(vmfbs[key]): continue elif vmfbs[key] == None and os.path.exists(default_filepath): @@ -258,7 +260,7 @@ def export_submodel( return clip_vmfb, clip_external_weight_path case "scheduler": if self.cpu_scheduling: - return utils.get_scheduler(self.hf_model_name, self.scheduler_id), None + return schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), None scheduler = schedulers.export_scheduler( self.hf_model_name, self.scheduler_id, @@ -346,20 +348,20 @@ def load_pipeline( self.runners = {} runners = {} runners["tokenizers"] = [] - runners["tokenizers"] += CLIPTokenizer.from_pretrained( + runners["tokenizers"].append(CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer", - ) + )) if self.is_sdxl: - runners["tokenizers"] += CLIPTokenizer.from_pretrained( + runners["tokenizers"].append(CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer_2", - ), + )) runners["clip"] = vmfbRunner( rt_device, vmfbs["clip"], weights["clip"] ) - if isinstance(vmfbs["scheduler"], torch.nn.Module): + if self.cpu_scheduling: self.scheduler = schedulers.SchedulingModel(vmfbs['scheduler'], self.height, self.width) else: self.scheduler = schedulers.SharkSchedulerWrapper(rt_device, vmfbs["scheduler"], weights["scheduler"]) From ed522272a0bd91d42d077ab082f50a4ac74efa6f Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 02:16:39 -0500 Subject: [PATCH 067/174] Fix arg parsers --- .../custom_models/sd_inference/clip_runner.py | 45 +--------------- .../sd_inference/schedulers_runner.py | 54 +------------------ .../custom_models/sd_inference/unet.py | 3 +- .../custom_models/sd_inference/unet_runner.py | 44 +-------------- .../custom_models/sd_inference/vae.py | 2 +- .../custom_models/sd_inference/vae_runner.py | 41 +------------- 6 files changed, 8 insertions(+), 181 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip_runner.py b/models/turbine_models/custom_models/sd_inference/clip_runner.py index a4cf677cb..5a99471bf 100644 --- a/models/turbine_models/custom_models/sd_inference/clip_runner.py +++ b/models/turbine_models/custom_models/sd_inference/clip_runner.py @@ -4,49 +4,6 @@ from iree import runtime as ireert import torch -parser = argparse.ArgumentParser() - -# TODO move common runner flags to generic flag file -parser.add_argument( - "--vmfb_path", type=str, default="", help="path to vmfb containing compiled module" -) -parser.add_argument( - 
"--external_weight_path", - type=str, - default="", - help="path to external weight parameters if model compiled without them", -) -parser.add_argument( - "--compare_vs_torch", - action="store_true", - help="Runs both turbine vmfb and a torch model to compare results", -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument( - "--hf_auth_token", - type=str, - help="The Hugging face auth token, required for some models", -) -parser.add_argument( - "--device", - type=str, - default="local-task", - help="local-sync, local-task, cuda, vulkan, rocm", -) - -parser.add_argument( - "--prompt", - type=str, - default="a photograph of an astronaut riding a horse", - help="prompt for clip model", -) - - def run_clip( device, prompt, vmfb_path, hf_model_name, hf_auth_token, external_weight_path ): @@ -168,7 +125,7 @@ def run_torch_clip(hf_model_name, hf_auth_token, prompt): if __name__ == "__main__": - args = parser.parse_args() + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args turbine_output = run_clip( args.device, args.prompt, diff --git a/models/turbine_models/custom_models/sd_inference/schedulers_runner.py b/models/turbine_models/custom_models/sd_inference/schedulers_runner.py index 45663c0a6..23c60f179 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers_runner.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers_runner.py @@ -4,65 +4,13 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import argparse from turbine_models.model_runner import vmfbRunner from iree import runtime as ireert import torch from diffusers import ( - PNDMScheduler, UNet2DConditionModel, ) -parser = argparse.ArgumentParser() - -# TODO move common runner flags to generic flag file -parser.add_argument( - "--scheduler_id", - type=str, - help="Scheduler ID", - default="PNDM", -) -parser.add_argument( - "--num_inference_steps", type=int, default=50, help="Number of inference steps" -) -parser.add_argument( - "--vmfb_path", type=str, default="", help="path to vmfb containing compiled module" -) -parser.add_argument( - "--external_weight_path", - type=str, - default="", - help="path to external weight parameters if model compiled without them", -) -parser.add_argument( - "--compare_vs_torch", - action="store_true", - help="Runs both turbine vmfb and a torch model to compare results", -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="stabilityai/stable-diffusion-xl-base-1.0", -) -parser.add_argument( - "--hf_auth_token", - type=str, - help="The Hugging face auth token, required for some models", -) -parser.add_argument( - "--device", - type=str, - default="local-task", - help="local-sync, local-task, cuda, vulkan, rocm", -) -parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference" -) -parser.add_argument( - "--height", type=int, default=1024, help="Height of Stable Diffusion" -) -parser.add_argument("--width", type=int, default=1024, help="Width of Stable Diffusion") def run_scheduler( @@ -197,7 +145,7 @@ def forward(self, sample, prompt_embeds, text_embeds, time_ids): if __name__ == "__main__": - args = parser.parse_args() + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args sample = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=torch.float32 ) diff --git 
a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 3d3137823..1df7dd2c4 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -168,7 +168,8 @@ def main( if __name__ == "__main__": - from .sd_cmd_opts import args + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + if args.input_mlir: unet_model = None else: diff --git a/models/turbine_models/custom_models/sd_inference/unet_runner.py b/models/turbine_models/custom_models/sd_inference/unet_runner.py index 1b8c5d101..fb2f40782 100644 --- a/models/turbine_models/custom_models/sd_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sd_inference/unet_runner.py @@ -4,48 +4,6 @@ from iree import runtime as ireert import torch -parser = argparse.ArgumentParser() - -# TODO move common runner flags to generic flag file -parser.add_argument( - "--vmfb_path", type=str, default="", help="path to vmfb containing compiled module" -) -parser.add_argument( - "--external_weight_path", - type=str, - default="", - help="path to external weight parameters if model compiled without them", -) -parser.add_argument( - "--compare_vs_torch", - action="store_true", - help="Runs both turbine vmfb and a torch model to compare results", -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument( - "--hf_auth_token", - type=str, - help="The Hugging face auth token, required for some models", -) -parser.add_argument( - "--device", - type=str, - default="local-task", - help="local-sync, local-task, cuda, vulkan, rocm", -) -parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference" -) -parser.add_argument( - "--height", type=int, default=512, help="Height of Stable Diffusion" -) -parser.add_argument("--width", type=int, default=512, help="Width of Stable Diffusion") - def run_unet( device, @@ -145,6 +103,8 @@ def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): if args.compare_vs_torch: print("generating torch output: ") from turbine_models.custom_models.sd_inference import utils + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + torch_output = run_torch_unet( args.hf_model_name, diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 2286ebdf2..d4b6fe094 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -153,7 +153,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): if __name__ == "__main__": - from .sd_cmd_opts import args + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args if args.input_mlir: vae_model = None else: diff --git a/models/turbine_models/custom_models/sd_inference/vae_runner.py b/models/turbine_models/custom_models/sd_inference/vae_runner.py index cce53c118..4b561f647 100644 --- a/models/turbine_models/custom_models/sd_inference/vae_runner.py +++ b/models/turbine_models/custom_models/sd_inference/vae_runner.py @@ -4,45 +4,6 @@ from iree import runtime as ireert import torch -parser = argparse.ArgumentParser() - -# TODO move common runner flags to generic flag file -parser.add_argument( - "--vmfb_path", type=str, default="", help="path to vmfb containing compiled module" -) -parser.add_argument( - 
"--external_weight_path", - type=str, - default="", - help="path to external weight parameters if model compiled without them", -) -parser.add_argument( - "--compare_vs_torch", - action="store_true", - help="Runs both turbine vmfb and a torch model to compare results", -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument( - "--device", - type=str, - default="local-task", - help="local-sync, local-task, cuda, vulkan, rocm", -) -parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference" -) -parser.add_argument( - "--height", type=int, default=512, help="Height of Stable Diffusion" -) -parser.add_argument("--width", type=int, default=512, help="Width of Stable Diffusion") -parser.add_argument("--variant", type=str, default="decode") - - def run_vae(device, example_input, vmfb_path, hf_model_name, external_weight_path): runner = vmfbRunner(device, vmfb_path, external_weight_path) @@ -114,7 +75,7 @@ def encode_inp(self, inp): if __name__ == "__main__": - args = parser.parse_args() + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args if args.variant == "decode": example_input = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=torch.float32 From 7d30d31f759dc553446a631631ab25afdd944c62 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 11:07:29 -0500 Subject: [PATCH 068/174] Formatting. --- .../custom_models/sd_inference/clip.py | 5 +- .../custom_models/sd_inference/clip_runner.py | 2 + .../custom_models/sd_inference/schedulers.py | 68 +++++++------- .../sd_inference/schedulers_runner.py | 2 +- .../custom_models/sd_inference/sd_pipeline.py | 89 +++++++++++-------- .../sd_inference/tokenization.py | 40 +++++---- .../custom_models/sd_inference/unet.py | 6 +- .../custom_models/sd_inference/unet_runner.py | 1 - .../custom_models/sd_inference/utils.py | 14 +-- .../custom_models/sd_inference/vae.py | 8 +- .../custom_models/sd_inference/vae_runner.py | 2 + .../custom_models/sdxl_inference/unet.py | 3 +- .../custom_models/sdxl_inference/vae.py | 3 +- 13 files changed, 138 insertions(+), 105 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 523f8a986..ef69e8a6d 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -118,7 +118,7 @@ def export_clip( if weights_only: return external_weight_path - + if "google/t5" in hf_model_name: class CompiledClip(CompiledModule): @@ -176,6 +176,7 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): ) return None, vmfb_path + if __name__ == "__main__": from .sd_cmd_opts import args @@ -203,4 +204,4 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) - print("Saved to", safe_name + ".mlir") \ No newline at end of file + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/clip_runner.py b/models/turbine_models/custom_models/sd_inference/clip_runner.py index 5a99471bf..fe5310ff6 100644 --- a/models/turbine_models/custom_models/sd_inference/clip_runner.py +++ b/models/turbine_models/custom_models/sd_inference/clip_runner.py @@ -4,6 +4,7 @@ from iree import runtime as ireert import torch + def run_clip( device, prompt, vmfb_path, hf_model_name, hf_auth_token, 
external_weight_path ): @@ -126,6 +127,7 @@ def run_torch_clip(hf_model_name, hf_auth_token, prompt): if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + turbine_output = run_clip( args.device, args.prompt, diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index cdafbcbe6..02d2f5d87 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -33,18 +33,17 @@ from turbine_models.custom_models.sd_inference import utils from turbine_models.model_runner import vmfbRunner -class SharkSchedulerWrapper(): + +class SharkSchedulerWrapper: def __init__(self, rt_device, vmfb, weights): - self.runner = vmfbRunner( - rt_device, vmfb, weights - ) - + self.runner = vmfbRunner(rt_device, vmfb, weights) + def initialize(self, sample): return self.runner.ctx.modules.scheduler["initialize"](sample) - + def scale_model_input(self, sample, t): return self.runner.ctx.modules.scheduler["scale_model_input"](sample, t) - + def step(self, sample, latents, t): return self.runner.ctx.modules.scheduler["step"](sample, latents, t) @@ -69,30 +68,38 @@ def initialize(self, sample): step_indexes = torch.tensor(len(timesteps)) sample = sample * self.model.init_noise_sigma return sample.type(self.dtype), add_time_ids, step_indexes - + def scale_model_input(self, sample, t): self.model.scale_model_input(sample, t) def step(self, sample, latents, t): self.model.step(self, sample, latents, t) + class SharkSchedulerCPUWrapper(SchedulingModel): def __init__(self, pipe, scheduler, height, width): super().__init__(scheduler, height, width) self.dest = pipe.runner["unet"].config.device self.dtype = pipe.iree_dtype - + def initialize(self, sample): for output in super().initialize(sample): iree_arrays = ireert.asdevicearray(self.dest, output, self.dtype) - + return iree_arrays - + def scale_model_input(self, sample, t): - return ireert.asdevicearray(self.dest, super.scale_model_input(sample, t), self.dtype) - + return ireert.asdevicearray( + self.dest, super.scale_model_input(sample, t), self.dtype + ) + def step(self, sample, latents, t): - return ireert.asdevicearray(self.dest, super.step(sample.to_host(), latents.to_host(), t.to_host()), self.dtype) + return ireert.asdevicearray( + self.dest, + super.step(sample.to_host(), latents.to_host(), t.to_host()), + self.dtype, + ) + def export_scheduler_model( hf_model_name: str, @@ -113,9 +120,7 @@ def export_scheduler_model( ): schedulers = utils.get_schedulers(hf_model_name) scheduler = schedulers[scheduler_id] - scheduler_module = SchedulingModel( - hf_model_name, scheduler - ) + scheduler_module = SchedulingModel(hf_model_name, scheduler) vmfb_name = ( scheduler_id + "_" @@ -124,17 +129,12 @@ def export_scheduler_model( + precision + "_" + str(num_inference_steps), - + "_" - + target_triple + +"_" + target_triple, ) if pipeline_dir: - safe_name = os.path.join( - pipeline_dir, vmfb_name - ) + safe_name = os.path.join(pipeline_dir, vmfb_name) else: - safe_name = utils.create_safe_name( - hf_model_name, vmfb_name - ) + safe_name = utils.create_safe_name(hf_model_name, vmfb_name) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -147,7 +147,7 @@ def export_scheduler_model( return_path=not exit_on_vmfb, ) return vmfb_path - + dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": @@ -168,14 +168,14 @@ def initialize( 
sample=AbstractTensor(*sample, dtype=dtype), ): return jittable(scheduler_module.initialize)(sample) - + def scale_model_input( self, sample=AbstractTensor(*sample, dtype=dtype), t=AbstractTensor(1, dtype=dtype), ): return jittable(scheduler_module.scale_model_input)(sample, t) - + def step( self, sample=AbstractTensor(*sample, dtype=dtype), @@ -204,8 +204,8 @@ def step( exit() return vmfb -# from shark_turbine.turbine_models.schedulers import export_scheduler_model +# from shark_turbine.turbine_models.schedulers import export_scheduler_model def get_scheduler(model_id, scheduler_id): @@ -213,7 +213,9 @@ def get_scheduler(model_id, scheduler_id): print(f"\n[LOG] Initializing schedulers from model id: {model_id}") schedulers = {} for sched in SCHEDULER_MAP: - schedulers[sched] = SCHEDULER_MAP[sched].from_pretrained(model_id, subfolder="scheduler") + schedulers[sched] = SCHEDULER_MAP[sched].from_pretrained( + model_id, subfolder="scheduler" + ) schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", algorithm_type="dpmsolver" ) @@ -237,6 +239,7 @@ def get_scheduler(model_id, scheduler_id): ) return schedulers[scheduler_id] + SCHEDULER_MAP = { "PNDM": PNDMScheduler, "DDPM": DDPMScheduler, @@ -270,7 +273,10 @@ def get_scheduler(model_id, scheduler_id): exit_on_vmfb=False, input_mlir=args.input_mlir, ) - safe_name = utils.create_safe_name(args.hf_model_name, "_" + args.scheduler_id + "_" + str(args.num_inference_steps)) + safe_name = utils.create_safe_name( + args.hf_model_name, + "_" + args.scheduler_id + "_" + str(args.num_inference_steps), + ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/schedulers_runner.py b/models/turbine_models/custom_models/sd_inference/schedulers_runner.py index 23c60f179..54b9c47f1 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers_runner.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers_runner.py @@ -12,7 +12,6 @@ ) - def run_scheduler( device, sample, @@ -146,6 +145,7 @@ def forward(self, sample, prompt_embeds, text_embeds, time_ids): if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + sample = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=torch.float32 ) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 77bb33a24..473b6836c 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -50,6 +50,7 @@ "vae_decode": None, } + class SharkSDPipeline: def __init__( self, @@ -260,7 +261,10 @@ def export_submodel( return clip_vmfb, clip_external_weight_path case "scheduler": if self.cpu_scheduling: - return schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), None + return ( + schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), + None, + ) scheduler = schedulers.export_scheduler( self.hf_model_name, self.scheduler_id, @@ -283,7 +287,7 @@ def export_submodel( unet_torch = None else: unet_torch = self.get_torch_models("unet") - + unet_vmfb = unet.export_unet_model( unet_torch, self.hf_model_name, @@ -335,7 +339,6 @@ def export_submodel( ) return vae_decode_vmfb, vae_external_weight_path - # LOAD def load_pipeline( @@ -348,27 +351,31 @@ def load_pipeline( self.runners = {} runners = 
{} runners["tokenizers"] = [] - runners["tokenizers"].append(CLIPTokenizer.from_pretrained( - self.hf_model_name, - subfolder="tokenizer", - )) - if self.is_sdxl: - runners["tokenizers"].append(CLIPTokenizer.from_pretrained( + runners["tokenizers"].append( + CLIPTokenizer.from_pretrained( self.hf_model_name, - subfolder="tokenizer_2", - )) - - runners["clip"] = vmfbRunner( - rt_device, vmfbs["clip"], weights["clip"] + subfolder="tokenizer", + ) ) + if self.is_sdxl: + runners["tokenizers"].append( + CLIPTokenizer.from_pretrained( + self.hf_model_name, + subfolder="tokenizer_2", + ) + ) + + runners["clip"] = vmfbRunner(rt_device, vmfbs["clip"], weights["clip"]) if self.cpu_scheduling: - self.scheduler = schedulers.SchedulingModel(vmfbs['scheduler'], self.height, self.width) + self.scheduler = schedulers.SchedulingModel( + vmfbs["scheduler"], self.height, self.width + ) else: - self.scheduler = schedulers.SharkSchedulerWrapper(rt_device, vmfbs["scheduler"], weights["scheduler"]) + self.scheduler = schedulers.SharkSchedulerWrapper( + rt_device, vmfbs["scheduler"], weights["scheduler"] + ) - runners["unet"] = vmfbRunner( - rt_device, vmfbs["unet"], weights["unet"] - ) + runners["unet"] = vmfbRunner(rt_device, vmfbs["unet"], weights["unet"]) runners["vae_decode"] = vmfbRunner( rt_device, vmfbs["vae_decode"], weights["vae_decode"] ) @@ -411,7 +418,9 @@ def generate_images( ) samples.append( ireert.asdevicearray( - self.runners["unet"].config.device, rand_sample, dtype=self.iree_dtype + self.runners["unet"].config.device, + rand_sample, + dtype=self.iree_dtype, ) ) @@ -425,13 +434,15 @@ def generate_images( # Tokenize prompt and negative prompt. - prompt_embeds, negative_embeds = get_weighted_text_embeddings(self, prompt, negative_prompt) + prompt_embeds, negative_embeds = get_weighted_text_embeddings( + self, prompt, negative_prompt + ) encode_prompts_end = time.time() for i in range(batch_count): unet_start = time.time() - + sample, add_time_ids, timesteps = self.scheduler.initialize(samples[i]) if self.is_img2img: @@ -441,44 +452,47 @@ def generate_images( t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] latents = self.encode_image(image) - latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1)) + latents = self.scheduler.add_noise( + latents, noise, timesteps[0].repeat(1) + ) return latents, [timesteps] if self.cpu_scheduling: sample = ireert.asdevicearray( self.runners["unet"].config.device, np.asarray(sample), - dtype=self.iree_dtype + dtype=self.iree_dtype, ) add_time_ids = ireert.asdevicearray( self.runners["unet"].config.device, np.asarray(add_time_ids), - dtype=self.iree_dtype + dtype=self.iree_dtype, ) timesteps = ireert.asdevicearray( self.runners["unet"].config.device, np.asarray(timesteps), - dtype=self.iree_dtype + dtype=self.iree_dtype, ) for t in range(timesteps): - latents = self.scheduler.scale_model_input( - sample, t - ) + latents = self.scheduler.scale_model_input(sample, t) latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( - latents, prompt_embeds, negative_embeds, add_time_ids, guidance_scale, t - ) - sample = self.scheduler.step( - sample, latents, t + latents, + prompt_embeds, + negative_embeds, + add_time_ids, + guidance_scale, + t, ) - + sample = self.scheduler.step(sample, latents, t) + if self.cpu_scheduling: sample = ireert.asdevicearray( self.runners["vae_decode"].config.device, np.asarray(sample), - dtype=self.iree_dtype + dtype=self.iree_dtype, ) - + vae_start = time.time() 
vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( sample @@ -505,8 +519,7 @@ def generate_images( print("VAE time: ", pipe_end - vae_start, "sec") print( f"\nTotal time (txt2img, batch #{str(i+1)}): ", - (encode_prompts_end - tokenize_start) - + (pipe_end - unet_start), + (encode_prompts_end - tokenize_start) + (pipe_end - unet_start), "sec\n", ) end = time.time() diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py index 83bdcb881..18056488a 100644 --- a/models/turbine_models/custom_models/sd_inference/tokenization.py +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -210,10 +210,14 @@ def get_unweighted_text_embeddings( text_input_chunk[:, 0] = text_input[0, 0] text_input_chunk[:, -1] = text_input[0, -1] - text_input_chunk = ireert.asdevicearray(pipe.runners["clip"].config.device, text_input_chunk, pipe.iree_dtype) - text_embedding = pipe.runners["clip"].ctx.modules.compiled_clip["encode_prompts"]( - text_input_chunk - ).to_host() + text_input_chunk = ireert.asdevicearray( + pipe.runners["clip"].config.device, text_input_chunk, pipe.iree_dtype + ) + text_embedding = ( + pipe.runners["clip"] + .ctx.modules.compiled_clip["encode_prompts"](text_input_chunk) + .to_host() + ) if no_boseos_middle: if i == 0: # discard the ending token @@ -241,6 +245,7 @@ def get_unweighted_text_embeddings( def filter_nonetype_tokens(tokens: List[List]): return [[49407 if token is None else token for token in tokens[0]]] + def get_tokenized_inputs( pipe, tokenizer, @@ -330,6 +335,7 @@ def get_tokenized_inputs( else: return prompt_tokens, prompt_weights, None, None + def get_weighted_text_embeddings( pipe, prompt: List[str], @@ -340,17 +346,19 @@ def get_weighted_text_embeddings( skip_weighting: Optional[bool] = False, ): max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 - for tokenizer in pipe.runners['tokenizers']: - prompt_tokens, prompt_weights, uncond_tokens, uncond_weights = get_tokenized_inputs( - pipe, - tokenizer, - prompt, - uncond_prompt, - max_length, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting + for tokenizer in pipe.runners["tokenizers"]: + prompt_tokens, prompt_weights, uncond_tokens, uncond_weights = ( + get_tokenized_inputs( + pipe, + tokenizer, + prompt, + uncond_prompt, + max_length, + max_embeddings_multiples, + no_boseos_middle, + skip_parsing, + skip_weighting, + ) ) # get the embeddings @@ -401,4 +409,4 @@ def get_weighted_text_embeddings( if uncond_prompt is not None: return text_embeddings, uncond_embeddings - return text_embeddings, None \ No newline at end of file + return text_embeddings, None diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 1df7dd2c4..86facb772 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -25,7 +25,6 @@ from turbine_models.turbine_tank import turbine_tank - class UnetModel(torch.nn.Module): def __init__(self, hf_model_name): super().__init__() @@ -45,6 +44,7 @@ def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): ) return noise_pred + def export_unet_model( unet_model, hf_model_name, @@ -72,9 +72,7 @@ def export_unet_model( else: do_classifier_free_guidance = True if pipeline_dir: - safe_name = os.path.join( - pipeline_dir, f"unet" - ) + safe_name = os.path.join(pipeline_dir, 
f"unet") else: safe_name = utils.create_safe_name( hf_model_name, diff --git a/models/turbine_models/custom_models/sd_inference/unet_runner.py b/models/turbine_models/custom_models/sd_inference/unet_runner.py index fb2f40782..172229e77 100644 --- a/models/turbine_models/custom_models/sd_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sd_inference/unet_runner.py @@ -105,7 +105,6 @@ def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): from turbine_models.custom_models.sd_inference import utils from turbine_models.custom_models.sd_inference.sd_cmd_opts import args - torch_output = run_torch_unet( args.hf_model_name, args.hf_auth_token, diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 35245100d..9593b46a9 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -19,7 +19,7 @@ "--iree-opt-data-tiling=false", "--iree-codegen-gpu-native-math-precision=true", "--iree-rocm-waves-per-eu=2", - "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [ @@ -31,7 +31,7 @@ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true" + "--iree-codegen-llvmgpu-use-vector-distribution=true", ], "vae": ["--iree-flow-enable-aggressive-fusion"], } @@ -215,10 +215,10 @@ def get_schedulers(model_id): model_id, subfolder="scheduler", ) - schedulers[ - "EulerAncestralDiscrete" - ] = EulerAncestralDiscreteScheduler.from_pretrained( - model_id, - subfolder="scheduler", + schedulers["EulerAncestralDiscrete"] = ( + EulerAncestralDiscreteScheduler.from_pretrained( + model_id, + subfolder="scheduler", + ) ) return schedulers diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index d4b6fe094..a960cc115 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -21,6 +21,7 @@ import argparse from turbine_models.turbine_tank import turbine_tank + class VaeModel(torch.nn.Module): def __init__( self, @@ -89,7 +90,8 @@ def export_vae_model( safe_name = os.path.join(pipeline_dir, "vae_" + variant) else: safe_name = utils.create_safe_name( - hf_model_name, f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}" + hf_model_name, + f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}", ) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -151,9 +153,9 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): return vmfb_path - if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + if args.input_mlir: vae_model = None else: @@ -187,4 +189,4 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): ) with open(f"{safe_name}.mlir", "w+") as f: f.write(mod_str) - print("Saved to", safe_name + ".mlir") \ No newline at end of file + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/vae_runner.py b/models/turbine_models/custom_models/sd_inference/vae_runner.py index 4b561f647..cded33824 
100644 --- a/models/turbine_models/custom_models/sd_inference/vae_runner.py +++ b/models/turbine_models/custom_models/sd_inference/vae_runner.py @@ -4,6 +4,7 @@ from iree import runtime as ireert import torch + def run_vae(device, example_input, vmfb_path, hf_model_name, external_weight_path): runner = vmfbRunner(device, vmfb_path, external_weight_path) @@ -76,6 +77,7 @@ def encode_inp(self, inp): if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args + if args.variant == "decode": example_input = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=torch.float32 diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index ca1781fa6..ef3db6212 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -107,7 +107,8 @@ def export_unet_model( do_classifier_free_guidance = True safe_name = utils.create_safe_name( - hf_model_name, f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}" + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}", ) if input_mlir: diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index 5f7726dc8..6b21645e7 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -89,7 +89,8 @@ def export_vae_model( safe_name = os.path.join(pipeline_dir, "vae_" + variant) else: safe_name = utils.create_safe_name( - hf_model_name, f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}" + hf_model_name, + f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}", ) if input_mlir: vmfb_path = utils.compile_to_vmfb( From f6e1d8e93eaec30d2302dfe96a5fc10e89817020 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 11:18:52 -0500 Subject: [PATCH 069/174] Fix scheduler api in test. 
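In outline: the test previously built its scheduler through utils.get_schedulers and the old schedulers.Scheduler wrapper; it now goes through the module-level helpers from the reworked schedulers module. A hedged sketch of the call pattern, with placeholder values standing in for the test's default_arguments (only the arguments the test actually passes are shown):

from turbine_models.custom_models.sd_inference import schedulers

hf_model_name = "CompVis/stable-diffusion-v1-4"   # placeholder
scheduler_id = "PNDM"                             # placeholder

scheduler = schedulers.get_scheduler(hf_model_name, scheduler_id)
scheduler_module = schedulers.SchedulingModel(
    scheduler, height=512, width=512, num_inference_steps=50
)
vmfb = schedulers.export_scheduler_model(
    hf_model_name,
    scheduler_id,
    1,                    # batch_size
    512,                  # height
    512,                  # width
    50,                   # num_inference_steps
    "fp32",               # precision
    "vmfb",               # compile_to
    "cpu",                # device
    "x86_64-linux-gnu",   # iree_target_triple
)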
--- .../custom_models/sd_inference/schedulers.py | 11 +++++--- models/turbine_models/tests/sd_test.py | 28 +++++++++---------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 02d2f5d87..0d1c32168 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -49,10 +49,12 @@ def step(self, sample, latents, t): class SchedulingModel(torch.nn.Module): - def __init__(self, scheduler, height, width): + def __init__(self, scheduler, height, width, num_inference_steps): self.model = scheduler self.height = height self.width = width + self.scheduler.set_timesteps(num_inference_steps) + self.scheduler.is_scale_input_called = True def initialize(self, sample): height = sample.shape[-2] * 8 @@ -118,9 +120,10 @@ def export_scheduler_model( input_mlir: str = None, upload_ir=False, ): - schedulers = utils.get_schedulers(hf_model_name) - scheduler = schedulers[scheduler_id] - scheduler_module = SchedulingModel(hf_model_name, scheduler) + scheduler = get_scheduler(hf_model_name, scheduler_id) + scheduler_module = SchedulingModel( + hf_model_name, scheduler, height, width, num_inference_steps + ) vmfb_name = ( scheduler_id + "_" diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 76c11bcba..a4339e477 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -64,13 +64,14 @@ custom_vae=None, ) -schedulers_dict = utils.get_schedulers( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", +scheduler = schedulers.get_scheduler( + default_arguments["hf_model_name"], default_arguments["scheduler_id"] ) -scheduler = schedulers_dict[default_arguments["scheduler_id"]] -scheduler_module = schedulers.Scheduler( - "CompVis/stable-diffusion-v1-4", default_arguments["num_inference_steps"], scheduler +scheduler_module = schedulers.SchedulingModel( + scheduler, + default_arguments["height"], + default_arguments["width"], + default_arguments["num_inference_steps"], ) @@ -357,18 +358,17 @@ def testExportVaeModelEncode(self): def testExportPNDMScheduler(self): current_args = copy.deepcopy(default_arguments) safe_name = "stable_diffusion_v1_4_scheduler" - blob_name = schedulers.export_scheduler( - scheduler_module, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", + blob_name = schedulers.export_scheduler_model( + current_args["hf_model_name"], + current_args["scheduler_id"], current_args["batch_size"], current_args["height"], current_args["width"], - None, + current_args["num_inference_steps"], + current_args["precision"], "vmfb", - "safetensors", - "stable_diffusion_v1_4_scheduler.safetensors", - "cpu", + current_args["device"], + current_args["iree_target_triple"], upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = safe_name + ".safetensors" From c6767fd55d625530c0683e05dde9806c2297fd1e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 11:25:25 -0500 Subject: [PATCH 070/174] Black version fix. 
--- .../custom_models/sd_inference/schedulers.py | 26 +++++++++--------- .../sd_inference/tokenization.py | 27 ++++++++++--------- .../custom_models/sd_inference/utils.py | 10 +++---- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 0d1c32168..52fd0a8ae 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -225,20 +225,20 @@ def get_scheduler(model_id, scheduler_id): schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", algorithm_type="dpmsolver++" ) - schedulers["DPMSolverMultistepKarras"] = ( - DPMSolverMultistepScheduler.from_pretrained( - model_id, - subfolder="scheduler", - use_karras_sigmas=True, - ) + schedulers[ + "DPMSolverMultistepKarras" + ] = DPMSolverMultistepScheduler.from_pretrained( + model_id, + subfolder="scheduler", + use_karras_sigmas=True, ) - schedulers["DPMSolverMultistepKarras++"] = ( - DPMSolverMultistepScheduler.from_pretrained( - model_id, - subfolder="scheduler", - algorithm_type="dpmsolver++", - use_karras_sigmas=True, - ) + schedulers[ + "DPMSolverMultistepKarras++" + ] = DPMSolverMultistepScheduler.from_pretrained( + model_id, + subfolder="scheduler", + algorithm_type="dpmsolver++", + use_karras_sigmas=True, ) return schedulers[scheduler_id] diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py index 18056488a..18d88da1f 100644 --- a/models/turbine_models/custom_models/sd_inference/tokenization.py +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -347,18 +347,21 @@ def get_weighted_text_embeddings( ): max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 for tokenizer in pipe.runners["tokenizers"]: - prompt_tokens, prompt_weights, uncond_tokens, uncond_weights = ( - get_tokenized_inputs( - pipe, - tokenizer, - prompt, - uncond_prompt, - max_length, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - ) + ( + prompt_tokens, + prompt_weights, + uncond_tokens, + uncond_weights, + ) = get_tokenized_inputs( + pipe, + tokenizer, + prompt, + uncond_prompt, + max_length, + max_embeddings_multiples, + no_boseos_middle, + skip_parsing, + skip_weighting, ) # get the embeddings diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 9593b46a9..3a12c40e8 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -215,10 +215,10 @@ def get_schedulers(model_id): model_id, subfolder="scheduler", ) - schedulers["EulerAncestralDiscrete"] = ( - EulerAncestralDiscreteScheduler.from_pretrained( - model_id, - subfolder="scheduler", - ) + schedulers[ + "EulerAncestralDiscrete" + ] = EulerAncestralDiscreteScheduler.from_pretrained( + model_id, + subfolder="scheduler", ) return schedulers From 93ab363b9b917c6d456c75020a4d28b59849f96a Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 11:45:11 -0500 Subject: [PATCH 071/174] add scipy and fix scheduler init --- models/requirements.txt | 1 + .../turbine_models/custom_models/sd_inference/schedulers.py | 4 ++-- .../turbine_models/custom_models/sd_inference/sd_pipeline.py | 5 ++++- 3 files changed, 7 
insertions(+), 3 deletions(-) diff --git a/models/requirements.txt b/models/requirements.txt index 899a016f8..b140ea65c 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -9,3 +9,4 @@ azure-storage-blob # microsoft/phi model einops pytest +scipy diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 52fd0a8ae..c7ecc0d20 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -53,8 +53,8 @@ def __init__(self, scheduler, height, width, num_inference_steps): self.model = scheduler self.height = height self.width = width - self.scheduler.set_timesteps(num_inference_steps) - self.scheduler.is_scale_input_called = True + self.model.set_timesteps(num_inference_steps) + self.model.is_scale_input_called = True def initialize(self, sample): height = sample.shape[-2] * 8 diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 473b6836c..26a55698c 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -368,7 +368,10 @@ def load_pipeline( runners["clip"] = vmfbRunner(rt_device, vmfbs["clip"], weights["clip"]) if self.cpu_scheduling: self.scheduler = schedulers.SchedulingModel( - vmfbs["scheduler"], self.height, self.width + vmfbs["scheduler"], + self.height, + self.width, + self.num_inference_steps, ) else: self.scheduler = schedulers.SharkSchedulerWrapper( From c8014fcdb93a9280f1cdbce7431a816bef0ad397 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 12:07:35 -0500 Subject: [PATCH 072/174] Small fixes --- .../custom_models/sd_inference/sd_pipeline.py | 14 ++++++-------- .../custom_models/sd_inference/utils.py | 2 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 2 ++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 26a55698c..e0239e4d5 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -261,10 +261,7 @@ def export_submodel( return clip_vmfb, clip_external_weight_path case "scheduler": if self.cpu_scheduling: - return ( - schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), - None, - ) + return (None, None) scheduler = schedulers.export_scheduler( self.hf_model_name, self.scheduler_id, @@ -368,7 +365,7 @@ def load_pipeline( runners["clip"] = vmfbRunner(rt_device, vmfbs["clip"], weights["clip"]) if self.cpu_scheduling: self.scheduler = schedulers.SchedulingModel( - vmfbs["scheduler"], + schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), self.height, self.width, self.num_inference_steps, @@ -449,14 +446,15 @@ def generate_images( sample, add_time_ids, timesteps = self.scheduler.initialize(samples[i]) if self.is_img2img: + strength = 0.5 # should be user-facing init_timestep = min( - int(num_inference_steps * strength), num_inference_steps + int(self.num_inference_steps * strength), self.num_inference_steps ) - t_start = max(num_inference_steps - init_timestep, 0) + t_start = max(self.num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] latents = self.encode_image(image) latents = 
self.scheduler.add_noise( - latents, noise, timesteps[0].repeat(1) + latents, sample, timesteps[0].repeat(1) ) return latents, [timesteps] diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 3a12c40e8..1c77541c7 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -69,7 +69,7 @@ def compile_to_vmfb( ] ) device = "llvm-cpu" - elif device == "vulkan": + elif device in ["vulkan", "vulkan-spirv"]: flags.extend( [ "--iree-hal-target-backends=vulkan-spirv", diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index ef25ee0ec..f70b82cbc 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -81,6 +81,7 @@ def __init__( external_weights_dir: str = "./shark_weights", external_weights: str = "safetensors", vae_decomp_attn: bool = True, + custom_vae: str = "", ): self.hf_model_name = hf_model_name self.scheduler_id = scheduler_id @@ -99,6 +100,7 @@ def __init__( self.external_weights_dir = external_weights_dir self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn + self.custom_vae = custom_vae # FILE MANAGEMENT AND PIPELINE SETUP From 0ad9050b2c01757e138d1f84ec49add9665394ab Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 12:29:50 -0500 Subject: [PATCH 073/174] Fixes to match test function calls. --- models/turbine_models/custom_models/sd_inference/clip.py | 2 +- .../turbine_models/custom_models/sd_inference/sd_pipeline.py | 5 +++-- models/turbine_models/custom_models/sd_inference/unet.py | 1 + models/turbine_models/custom_models/sd_inference/vae.py | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index ef69e8a6d..e7fc8a5d8 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -45,7 +45,7 @@ parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -def export_clip( +def export_clip_model( hf_model_name, hf_auth_token: str = None, max_length: int = 64, diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index e0239e4d5..7d3796dfc 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -241,7 +241,7 @@ def export_submodel( input_mlir = copy.deepcopy(SUBMODELS) match submodel: case "clip": - _, clip_vmfb = clip.export_clip( + _, clip_vmfb = clip.export_clip_model( self.hf_model_name, None, self.max_length, @@ -262,7 +262,7 @@ def export_submodel( case "scheduler": if self.cpu_scheduling: return (None, None) - scheduler = schedulers.export_scheduler( + scheduler = schedulers.export_scheduler_model( self.hf_model_name, self.scheduler_id, self.batch_size, @@ -446,6 +446,7 @@ def generate_images( sample, add_time_ids, timesteps = self.scheduler.initialize(samples[i]) if self.is_img2img: + raise AssertionError, "Image-to-image not supported yet." 
strength = 0.5 # should be user-facing init_timestep = min( int(self.num_inference_steps * strength), self.num_inference_steps diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 86facb772..ac66d3108 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -66,6 +66,7 @@ def export_unet_model( attn_spec=None, input_mlir=None, weights_only=False, + upload_ir=False, ): if "turbo" in hf_model_name: do_classifier_free_guidance = False diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index a960cc115..bd9e99a23 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -85,6 +85,7 @@ def export_vae_model( attn_spec=None, input_mlir=None, weights_only=False, + upload_ir=False, ): if pipeline_dir: safe_name = os.path.join(pipeline_dir, "vae_" + variant) From dd13911f8dc81b36c006b0c1ba7aa99d9c20d2f0 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 12:51:40 -0500 Subject: [PATCH 074/174] Enable fetch of wmma spec. --- .../custom_models/sd_inference/utils.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 1c77541c7..b1eb77a02 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -1,3 +1,4 @@ +from urllib.request import urlopen import iree.compiler as ireec import numpy as np import os @@ -131,10 +132,9 @@ def compile_to_vmfb( # the TD spec is implemented in C++. if attn_spec not in [None, "", " "]: if attn_spec in ["default", "mfma"]: - attn_spec = os.path.join( - os.path.realpath(os.path.dirname(__file__)), - "default_mfma_attn_spec.mlir", - ) + attn_spec = get_mfma_spec_path(target_triple, os.path.dirname(safe_name)) + elif attn_spec in ["wmma"]: + attn_spec = get_wmma_spec_path(target_triple, os.path.dirname(safe_name)) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) print("Compiling to", device, "with flags:", flags) @@ -173,6 +173,27 @@ def create_safe_name(hf_model_name, model_name_str): return safe_name +def get_mfma_spec_path(target_chip, save_dir): + url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir" + attn_spec = urlopen(url).read().decode("utf-8") + spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir") + with open(spec_path, "w") as f: + f.write(attn_spec) + return spec_path + + +def get_wmma_spec_path(target_chip, save_dir): + if target_chip == "gfx1100": + url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1100.spec.mlir" + elif target_chip == "gfx1103": + url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1103.spec.mlir" + attn_spec = urlopen(url).read().decode("utf-8") + spec_path = os.path.join(save_dir, "attention_and_matmul_spec_wmma.mlir") + with open(spec_path, "w") as f: + f.write(attn_spec) + return spec_path + + def save_external_weights( mapper, model, From bd222b80a9ec8905687e04b1543c2e5cb2738c91 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 23 May 2024 15:08:00 -0500 Subject: [PATCH 075/174] Small fixes. 
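Among the small fixes below, the attention-spec handling in utils.compile_to_vmfb is reworked: "default"/"mfma" still resolve to the downloaded MFMA spec, while "wmma" now resolves to a chip-specific spec that may be unavailable, and the transform-dialect flag is only appended when a spec path actually exists. An illustrative sketch of that dispatch follows; the helper name resolve_attn_spec is made up for this note (the real logic stays inline in compile_to_vmfb) and it assumes the fetch helpers added in the previous patch.

import os
from turbine_models.custom_models.sd_inference.utils import (
    get_mfma_spec_path,
    get_wmma_spec_path,
)

def resolve_attn_spec(attn_spec, target_triple, safe_name):
    # Illustrative only; covers the "default"/"mfma" and "wmma" cases.
    save_dir = os.path.dirname(safe_name)
    if attn_spec in ["default", "mfma"]:
        return get_mfma_spec_path(target_triple, save_dir)
    if attn_spec == "wmma":
        # Returns None for chips without a published spec (only gfx1100/gfx1103 have one).
        return get_wmma_spec_path(target_triple, save_dir)
    return None

# spec = resolve_attn_spec("wmma", "gfx1100", safe_name)
# if spec:
#     flags.append("--iree-codegen-transform-dialect-library=" + spec)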
--- .../custom_models/sd_inference/clip.py | 2 +- .../custom_models/sd_inference/sd_pipeline.py | 2 +- .../custom_models/sd_inference/utils.py | 13 ++++++++----- models/turbine_models/tests/sd_test.py | 11 ++++------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index e7fc8a5d8..a15426d28 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -174,7 +174,7 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): const_expr_hoisting=True, attn_spec=td_spec, ) - return None, vmfb_path + return vmfb_path, None if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 7d3796dfc..2d3a0bc50 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -446,7 +446,7 @@ def generate_images( sample, add_time_ids, timesteps = self.scheduler.initialize(samples[i]) if self.is_img2img: - raise AssertionError, "Image-to-image not supported yet." + raise AssertionError("Image-to-image not supported yet.") strength = 0.5 # should be user-facing init_timestep = min( int(self.num_inference_steps * strength), self.num_inference_steps diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index b1eb77a02..173099845 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -130,12 +130,13 @@ def compile_to_vmfb( # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. # This is a temporary solution, and should be removed or largely disabled once the functionality of # the TD spec is implemented in C++. 
- if attn_spec not in [None, "", " "]: - if attn_spec in ["default", "mfma"]: - attn_spec = get_mfma_spec_path(target_triple, os.path.dirname(safe_name)) - elif attn_spec in ["wmma"]: - attn_spec = get_wmma_spec_path(target_triple, os.path.dirname(safe_name)) + if attn_spec in ["default", "mfma"]: + attn_spec = get_mfma_spec_path(target_triple, os.path.dirname(safe_name)) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) + elif attn_spec in ["wmma"]: + attn_spec = get_wmma_spec_path(target_triple, os.path.dirname(safe_name)) + if attn_spec: + flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) print("Compiling to", device, "with flags:", flags) @@ -187,6 +188,8 @@ def get_wmma_spec_path(target_chip, save_dir): url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1100.spec.mlir" elif target_chip == "gfx1103": url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1103.spec.mlir" + else: + return None attn_spec = urlopen(url).read().decode("utf-8") spec_path = os.path.join(save_dir, "attention_and_matmul_spec_wmma.mlir") with open(spec_path, "w") as f: diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index a4339e477..7482c374f 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -46,7 +46,6 @@ "device": "cpu", "rt_device": "local-task", "iree_target_triple": "x86_64-linux-gnu", - "vulkan_max_allocation": "4294967296", "prompt": "a photograph of an astronaut riding a horse", "in_channels": 4, } @@ -89,10 +88,9 @@ def testExportT5Model(self): external_weight_path=None, device="cpu", target_triple=None, - max_alloc=None, upload_ir=UPLOAD_IR, ) - current_args["vmfb_path"] = safe_prefix + "_clip.vmfb" + current_args["vmfb_path"] = blob_name turbine = clip_runner.run_clip( current_args["rt_device"], current_args["prompt"], @@ -126,7 +124,6 @@ def testExportClipVitLarge14(self): external_weight_path=safe_prefix + ".safetensors", device="cpu", target_triple=None, - max_alloc=None, upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = safe_prefix + ".safetensors" @@ -276,7 +273,7 @@ def testExportVaeModelDecode(self): upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - current_args["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" + current_args["vmfb_path"] = blob_name example_input = torch.rand( current_args["batch_size"], 4, @@ -325,7 +322,7 @@ def testExportVaeModelEncode(self): upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - current_args["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" + current_args["vmfb_path"] = blob_name example_input = torch.rand( current_args["batch_size"], 3, @@ -372,7 +369,7 @@ def testExportPNDMScheduler(self): upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = safe_name + ".safetensors" - current_args["vmfb_path"] = safe_name + ".vmfb" + current_args["vmfb_path"] = blob_name sample = torch.rand( current_args["batch_size"], 4, From 5a832428c27225cc7cab32dffbd1300a76aa1038 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 25 May 2024 14:35:50 -0500 Subject: [PATCH 076/174] Fixes for gfx1100, WIP sd1/2 pipe fixes --- .../custom_models/sd_inference/sd_pipeline.py | 6 +++--- .../sd_inference/tokenization.py | 2 +- .../custom_models/sd_inference/utils.py | 19 ++++++++++++++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git 
a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 2d3a0bc50..99857da8b 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -347,15 +347,15 @@ def load_pipeline( ): self.runners = {} runners = {} - runners["tokenizers"] = [] - runners["tokenizers"].append( + self.tokenizers = [] + self.tokenizers.append( CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer", ) ) if self.is_sdxl: - runners["tokenizers"].append( + self.tokenizers.append( CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer_2", diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py index 18d88da1f..124d3978f 100644 --- a/models/turbine_models/custom_models/sd_inference/tokenization.py +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -346,7 +346,7 @@ def get_weighted_text_embeddings( skip_weighting: Optional[bool] = False, ): max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 - for tokenizer in pipe.runners["tokenizers"]: + for tokenizer in pipe.tokenizers: ( prompt_tokens, prompt_weights, diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 173099845..f31635ccb 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -32,10 +32,24 @@ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true", ], "vae": ["--iree-flow-enable-aggressive-fusion"], } +GFX11_flags = { + "all": [ + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-outer-dim-concat=true", + "--iree-vm-target-truncate-unsupported-floats", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-opt-data-tiling=false", + "--iree-codegen-gpu-native-math-precision=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + ], + "unet": [""], + "clip": [""], + "vae": [""], +} def compile_to_vmfb( @@ -126,6 +140,9 @@ def compile_to_vmfb( flags.extend(MI_flags["vae"]) flags.extend(MI_flags["all"]) + if target_triple in ["gfx1100", "gfx1103", "gfx1150"]: + flags.extend(GFX11_flags["all"]) + # Currently, we need a transform dialect script to be applied to the compilation through IREE in certain cases. # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. 
# This is a temporary solution, and should be removed or largely disabled once the functionality of From 815da51e949473b7bcc69bc04ae83cb9278e30b5 Mon Sep 17 00:00:00 2001 From: ean garvey Date: Sat, 25 May 2024 18:55:03 -0400 Subject: [PATCH 077/174] turbo fixes --- .../custom_models/sd_inference/utils.py | 15 ++++++++++++-- .../sdxl_inference/sdxl_compiled_pipeline.py | 1 + .../sdxl_inference/sdxl_prompt_encoder.py | 2 +- .../sdxl_inference/sdxl_scheduled_unet.py | 20 +++++++++++++------ 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index f31635ccb..d7f5f7bf2 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -20,20 +20,23 @@ "--iree-opt-data-tiling=false", "--iree-codegen-gpu-native-math-precision=true", "--iree-rocm-waves-per-eu=2", - "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", + "--iree-codegen-llvmgpu-use-vector-distribution=false", ], "clip": [ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", ], - "vae": ["--iree-flow-enable-aggressive-fusion"], + "vae": [ + "--iree-flow-enable-aggressive-fusion", + "--iree-codegen-llvmgpu-use-vector-distribution=true" + ], } GFX11_flags = { "all": [ @@ -122,6 +125,14 @@ def compile_to_vmfb( elif ireec_flags == None: ireec_flags = [] + debug = True + if debug: + flags.extend( + [ + "--iree-hal-dump-executable-files-to=" + safe_name + "_dispatches" + ] + ) + for i, flag in enumerate(ireec_flags): k = flag.strip().split("=")[0] for idx, default in enumerate(flags): diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index f70b82cbc..bdc015096 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -101,6 +101,7 @@ def __init__( self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn self.custom_vae = custom_vae + self.do_classifier_free_guidance = False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True # FILE MANAGEMENT AND PIPELINE SETUP diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index fcd98be67..ad17ab82e 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -37,7 +37,7 @@ def __init__( subfolder="text_encoder_2", token=hf_auth_token, ) - self.do_classifier_free_guidance = do_classifier_free_guidance + self.do_classifier_free_guidance = False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True def forward( self, text_input_ids_1, text_input_ids_2, uncond_input_ids_1, uncond_input_ids_2 diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py 
b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index a0f3e0390..b46f1d264 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -37,6 +37,9 @@ def __init__( return_index=False, ): super().__init__() + self.do_classifier_free_guidance = True + if any(key in hf_model_name for key in ["turbo", "lightning"]): + self.do_classifier_free_guidance = False self.dtype = torch.float16 if precision == "fp16" else torch.float32 self.scheduler = utils.get_schedulers(hf_model_name)[scheduler_id] if scheduler_id == "PNDM": @@ -89,11 +92,15 @@ def forward( ): with torch.no_grad(): added_cond_kwargs = { - "text_embeds": text_embeds, "time_ids": time_ids, + "text_embeds": text_embeds, } t = self.scheduler.timesteps[step_index] - latent_model_input = torch.cat([sample] * 2) + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([sample] * 2) + else: + latent_model_input = sample.clone() + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred = self.unet.forward( latent_model_input, @@ -104,10 +111,11 @@ def forward( return_dict=False, )[0] - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) sample = self.scheduler.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) From 8348ff7d44416742f7419fed2804392138c14c4f Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 25 May 2024 17:55:47 -0500 Subject: [PATCH 078/174] various sd1.5/2.1 fixes --- .../custom_models/sd_inference/schedulers.py | 37 ++-- .../custom_models/sd_inference/sd_pipeline.py | 161 +++++++++--------- .../sd_inference/tokenization.py | 25 +-- .../custom_models/sd_inference/utils.py | 2 +- 4 files changed, 122 insertions(+), 103 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index c7ecc0d20..a80d2c854 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -49,12 +49,13 @@ def step(self, sample, latents, t): class SchedulingModel(torch.nn.Module): - def __init__(self, scheduler, height, width, num_inference_steps): + def __init__(self, scheduler, height, width, num_inference_steps, dtype): self.model = scheduler self.height = height self.width = width self.model.set_timesteps(num_inference_steps) self.model.is_scale_input_called = True + self.dtype = dtype def initialize(self, sample): height = sample.shape[-2] * 8 @@ -72,33 +73,37 @@ def initialize(self, sample): return sample.type(self.dtype), add_time_ids, step_indexes def scale_model_input(self, sample, t): - self.model.scale_model_input(sample, t) + return self.model.scale_model_input(sample, t) - def step(self, sample, latents, t): - self.model.step(self, sample, latents, t) + def step(self, latents, t, sample): + return self.model.step(latents, t, sample) -class SharkSchedulerCPUWrapper(SchedulingModel): - def __init__(self, pipe, scheduler, height, width): - super().__init__(scheduler, height, width) - self.dest = pipe.runner["unet"].config.device +class SharkSchedulerCPUWrapper(): 
+ def __init__(self, pipe, scheduler): + self.module = scheduler + self.dest = pipe.runners["unet"].config.device self.dtype = pipe.iree_dtype def initialize(self, sample): - for output in super().initialize(sample): - iree_arrays = ireert.asdevicearray(self.dest, output, self.dtype) + sample, add_time_ids, step_indexes = self.module.initialize(torch.from_numpy(sample.to_host())) + sample = ireert.asdevicearray(self.dest, sample, self.dtype) + add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) - return iree_arrays + return sample, add_time_ids, step_indexes def scale_model_input(self, sample, t): - return ireert.asdevicearray( - self.dest, super.scale_model_input(sample, t), self.dtype + scaled = ireert.asdevicearray( + self.dest, self.module.scale_model_input(torch.from_numpy(sample.to_host()), t), self.dtype ) + t = [self.module.model.timesteps[t]] + t = ireert.asdevicearray(self.dest, t, self.dtype) + return scaled, t - def step(self, sample, latents, t): + def step(self, latents, t, sample): return ireert.asdevicearray( self.dest, - super.step(sample.to_host(), latents.to_host(), t.to_host()), + self.module.step(torch.from_numpy(latents.to_host()), t, torch.from_numpy(sample.to_host())).prev_sample, self.dtype, ) @@ -261,7 +266,7 @@ def get_scheduler(model_id, scheduler_id): if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args - mod_str = export_scheduler( + mod_str = export_scheduler_model( args.hf_model_name, args.scheduler_id, args.batch_size, diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 99857da8b..5d227c3f4 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -8,6 +8,8 @@ import copy import torch import iree.runtime as ireert +from random import randint +from tqdm.auto import tqdm from turbine_models.custom_models.sd_inference import ( clip, unet, @@ -74,6 +76,8 @@ def __init__( vae_decomp_attn: bool = True, ): self.hf_model_name = hf_model_name + self.iree_dtype = "float32" if precision == "fp32" else "float16" + self.torch_dtype = torch.float32 if precision == "fp32" else torch.float16 self.cpu_scheduling = True self.scheduler_id = scheduler_id self.height = height @@ -345,6 +349,7 @@ def load_pipeline( rt_device: str = "local-task", compiled_pipeline: bool = False, ): + self.is_img2img = False self.runners = {} runners = {} self.tokenizers = [] @@ -361,30 +366,55 @@ def load_pipeline( subfolder="tokenizer_2", ) ) - runners["clip"] = vmfbRunner(rt_device, vmfbs["clip"], weights["clip"]) - if self.cpu_scheduling: - self.scheduler = schedulers.SchedulingModel( - schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), - self.height, - self.width, - self.num_inference_steps, - ) - else: - self.scheduler = schedulers.SharkSchedulerWrapper( - rt_device, vmfbs["scheduler"], weights["scheduler"] - ) - runners["unet"] = vmfbRunner(rt_device, vmfbs["unet"], weights["unet"]) runners["vae_decode"] = vmfbRunner( rt_device, vmfbs["vae_decode"], weights["vae_decode"] ) self.runners = runners self.compiled_pipeline = False + if self.cpu_scheduling: + # torch_scheduler = schedulers.SchedulingModel( + # schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), + # self.height, + # self.width, + # self.num_inference_steps, + # self.torch_dtype, + # ) + # self.scheduler = schedulers.SharkSchedulerCPUWrapper( + # self, 
torch_scheduler + # ) + self.scheduler = schedulers.get_scheduler(self.hf_model_name, self.scheduler_id) + else: + self.scheduler = schedulers.SharkSchedulerWrapper( + rt_device, vmfbs["scheduler"], weights["scheduler"] + ) print("Successfully loaded pipeline.") # RUN + def prepare_latents( + self, + noise, + num_inference_steps, + image, + strength, + ): + self.scheduler.set_timesteps(num_inference_steps) + if self.is_img2img: + init_timestep = min( + int(num_inference_steps * strength), num_inference_steps + ) + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + latents = self.encode_image(image) + latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1)) + return latents, [timesteps] + else: + self.scheduler.is_scale_input_called = True + latents = noise * self.scheduler.init_noise_sigma + return latents, self.scheduler.timesteps + def generate_images( self, prompt: str, @@ -394,16 +424,17 @@ def generate_images( seed: float = -1, return_imgs: bool = False, ): - # TODO: implement case where this is false e.g. in SDXL Turbo - # do_classifier_free_guidance = True - - self.iree_dtype = "float32" if self.precision == "fp32" else "float16" - torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 pipe_start = time.time() samples = [] numpy_images = [] + + uint32_info = np.iinfo(np.uint32) + uint32_min, uint32_max = uint32_info.min, uint32_info.max + if seed < uint32_min or seed >= uint32_max: + seed = randint(uint32_min, uint32_max) + generator = torch.manual_seed(seed) for i in range(batch_count): generator = torch.random.manual_seed(seed + i) rand_sample = torch.randn( @@ -414,15 +445,16 @@ def generate_images( self.width // 8, ), generator=generator, - dtype=torch_dtype, - ) - samples.append( - ireert.asdevicearray( - self.runners["unet"].config.device, - rand_sample, - dtype=self.iree_dtype, - ) + dtype=self.torch_dtype, ) + samples.append(rand_sample) + # samples.append( + # ireert.asdevicearray( + # self.runners["unet"].config.device, + # rand_sample, + # dtype=self.iree_dtype, + # ) + # ) guidance_scale = ireert.asdevicearray( self.runners["unet"].config.device, @@ -438,62 +470,39 @@ def generate_images( self, prompt, negative_prompt ) + text_embeddings = torch.cat((negative_embeds, prompt_embeds), dim=0) + text_embeddings = ireert.asdevicearray( + self.runners["unet"].config.device, + text_embeddings, + dtype=self.iree_dtype, + ) encode_prompts_end = time.time() for i in range(batch_count): unet_start = time.time() - - sample, add_time_ids, timesteps = self.scheduler.initialize(samples[i]) - - if self.is_img2img: - raise AssertionError("Image-to-image not supported yet.") - strength = 0.5 # should be user-facing - init_timestep = min( - int(self.num_inference_steps * strength), self.num_inference_steps - ) - t_start = max(self.num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - latents = self.encode_image(image) - latents = self.scheduler.add_noise( - latents, sample, timesteps[0].repeat(1) - ) - return latents, [timesteps] - - if self.cpu_scheduling: - sample = ireert.asdevicearray( - self.runners["unet"].config.device, - np.asarray(sample), - dtype=self.iree_dtype, - ) - add_time_ids = ireert.asdevicearray( - self.runners["unet"].config.device, - np.asarray(add_time_ids), - dtype=self.iree_dtype, - ) - timesteps = ireert.asdevicearray( - self.runners["unet"].config.device, - np.asarray(timesteps), - dtype=self.iree_dtype, - ) - - for t in 
range(timesteps): - latents = self.scheduler.scale_model_input(sample, t) - latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( + image = None + strength = 0 + sample, timesteps = self.prepare_latents(samples[i], self.num_inference_steps, image, strength) + + for i, t in tqdm(enumerate(timesteps)): + latents = self.scheduler.scale_model_input(sample, t).to(self.torch_dtype) + timestep = torch.tensor([t]).to(self.torch_dtype).detach().numpy() + unet_inputs = [ latents, - prompt_embeds, - negative_embeds, - add_time_ids, - guidance_scale, - t, - ) - sample = self.scheduler.step(sample, latents, t) - - if self.cpu_scheduling: - sample = ireert.asdevicearray( - self.runners["vae_decode"].config.device, - np.asarray(sample), - dtype=self.iree_dtype, + timestep, + ] + if self.cpu_scheduling: + for inp in unet_inputs: + inp = ireert.asdevicearray( + self.runners["unet"].config.device, + inp, + dtype=self.iree_dtype, + ) + unet_inputs.extend([text_embeddings, guidance_scale]) + latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( + *unet_inputs ) + sample = self.scheduler.step(torch.tensor(latents.to_host(), dtype=self.torch_dtype), t, sample).prev_sample vae_start = time.time() vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py index 124d3978f..384642744 100644 --- a/models/turbine_models/custom_models/sd_inference/tokenization.py +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -111,7 +111,7 @@ def multiply_range(start_position, multiplier): return res -def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): +def get_prompts_with_weights(tokenizer, prompt: List[str], max_length: int): r""" Tokenize a list of prompts and return its tokens with weights of each token. No padding, starting or ending token is included. 
@@ -125,7 +125,7 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): text_weight = [] for word, weight in texts_and_weights: # tokenize and discard the starting and the ending token - token = pipe.tokenizer(word).input_ids[1:-1] + token = tokenizer(word).input_ids[1:-1] text_token += token # copy the weight by length of token text_weight += [weight] * len(token) @@ -211,13 +211,12 @@ def get_unweighted_text_embeddings( text_input_chunk[:, -1] = text_input[0, -1] text_input_chunk = ireert.asdevicearray( - pipe.runners["clip"].config.device, text_input_chunk, pipe.iree_dtype + pipe.runners["clip"].config.device, text_input_chunk, "int64" ) text_embedding = ( pipe.runners["clip"] - .ctx.modules.compiled_clip["encode_prompts"](text_input_chunk) - .to_host() - ) + .ctx.modules.compiled_clip["main"](text_input_chunk) + )[0].to_host() if no_boseos_middle: if i == 0: # discard the ending token @@ -235,8 +234,14 @@ def get_unweighted_text_embeddings( text_embeddings_np = np.concatenate(np.array(text_embeddings)) text_embeddings = torch.from_numpy(text_embeddings_np) else: - text_embeddings = pipe.run("clip", text_input)[0] - text_embeddings = torch.from_numpy(text_embeddings.to_host()) + text_input = ireert.asdevicearray( + pipe.runners["clip"].config.device, text_input, "int64" + ) + text_embeddings = ( + pipe.runners["clip"] + .ctx.modules.compiled_clip["main"](text_input) + )[0].to_host() + text_embeddings = torch.from_numpy(text_embeddings) return text_embeddings @@ -259,11 +264,11 @@ def get_tokenized_inputs( ): if not skip_parsing: prompt_tokens, prompt_weights = get_prompts_with_weights( - pipe, prompt, max_length - 2 + tokenizer, prompt, max_length - 2 ) if uncond_prompt is not None: uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2 + tokenizer, uncond_prompt, max_length - 2 ) else: prompt_tokens = [ diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index d7f5f7bf2..6498d8872 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -65,7 +65,7 @@ def compile_to_vmfb( const_expr_hoisting=True, mlir_source="str", max_alloc="4294967296", - save_mlir=False, + save_mlir=True, attn_spec=None, ): flags = [] From 11430ee7d983665ddd9ebaf64cc7396b5d55f4e9 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sun, 26 May 2024 01:54:48 -0500 Subject: [PATCH 079/174] Turbo support #2 --- .../custom_models/sd_inference/utils.py | 2 +- .../sdxl_inference/pipeline_ir.py | 22 +++++++++++++++++++ .../sdxl_inference/sdxl_compiled_pipeline.py | 5 ++++- .../sdxl_inference/sdxl_scheduled_unet.py | 8 ++++--- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 6498d8872..462798bf4 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -125,7 +125,7 @@ def compile_to_vmfb( elif ireec_flags == None: ireec_flags = [] - debug = True + debug = False if debug: flags.extend( [ diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index 0348666d3..c38d1a818 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ 
b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -72,6 +72,28 @@ } """ +sdxl_turbo_sched_unet_bench_f16 = """ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<1x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<1x64x2048xf16>, %arg2: tensor<1x1280xf16>, %arg3: tensor<1x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<1x64x2048xf16>, %t_embeds: tensor<1x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<1x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<1x64x2048xf16>, tensor<1x1280xf16>, tensor<1x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + return %res : tensor<1x4x128x128xf16> + } +} +""" + sdxl_sched_unet_bench_f32 = """ module @sdxl_compiled_pipeline { func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, 
\22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index bdc015096..6cdcbb242 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -16,6 +16,7 @@ from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( sdxl_sched_unet_bench_f32, sdxl_sched_unet_bench_f16, + sdxl_turbo_sched_unet_bench_f16, sdxl_pipeline_bench_f32, sdxl_pipeline_bench_f16, ) @@ -354,6 +355,9 @@ def export_submodel( if self.precision == "fp32" else sdxl_sched_unet_bench_f16 ) + if self.do_classifier_free_guidance == False: + assert self.precision == "fp16", "turbo only supported in fp16 precision." + pipeline_file = sdxl_turbo_sched_unet_bench_f16 pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, self.device, @@ -551,7 +555,6 @@ def generate_images( for i in range(batch_count): unet_start = time.time() - latents = self.runners["pipe"].ctx.modules.sdxl_compiled_pipeline[ "produce_image_latents" ](samples[i], prompt_embeds, add_text_embeds, guidance_scale) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index b46f1d264..e2f6f8b8a 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -14,6 +14,7 @@ from iree.compiler.ir import Context import numpy as np from shark_turbine.aot import * +import shark_turbine.ops as ops from turbine_models.custom_models.sd_inference import utils import torch import torch._dynamo as dynamo @@ -79,9 +80,10 @@ def initialize(self, sample): target_size = (height, width) crops_coords_top_left = (0, 0) add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids]) - add_time_ids = torch.cat([add_time_ids] * 2, dim=0) - add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) + add_time_ids = torch.tensor([add_time_ids], dtype=self.dtype) + if self.do_classifier_free_guidance: + add_time_ids = torch.cat([add_time_ids] * 2, dim=0) + add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) timesteps = self.scheduler.timesteps step_indexes = torch.tensor(len(timesteps)) sample = sample * self.scheduler.init_noise_sigma From b8a39feabbb3a4c5a5b4d018e8fc4f20414b0289 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 27 May 2024 00:01:12 -0500 Subject: [PATCH 080/174] Disable CFG-free turbo exports, small fixes --- models/requirements.txt | 1 + .../custom_models/sd_inference/schedulers.py | 16 ++- .../custom_models/sd_inference/sd_pipeline.py | 21 ++-- .../sd_inference/tokenization.py | 6 +- .../custom_models/sd_inference/utils.py | 14 ++- .../sdxl_inference/pipeline_ir.py | 16 +-- .../sdxl_inference/sdxl_compiled_pipeline.py | 28 +++-- 
.../sdxl_inference/sdxl_prompt_encoder.py | 46 ++++++- .../sdxl_inference/sdxl_scheduled_unet.py | 15 ++- .../sdxl_scheduled_unet_runner.py | 119 +++--------------- models/turbine_models/model_runner.py | 3 +- 11 files changed, 136 insertions(+), 149 deletions(-) diff --git a/models/requirements.txt b/models/requirements.txt index b140ea65c..1718afb5d 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -2,6 +2,7 @@ protobuf sentencepiece shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 +torchsde accelerate diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release # turbine tank downloading/uploading diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index a80d2c854..5cc37c8b5 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -79,14 +79,16 @@ def step(self, latents, t, sample): return self.model.step(latents, t, sample) -class SharkSchedulerCPUWrapper(): +class SharkSchedulerCPUWrapper: def __init__(self, pipe, scheduler): self.module = scheduler self.dest = pipe.runners["unet"].config.device self.dtype = pipe.iree_dtype def initialize(self, sample): - sample, add_time_ids, step_indexes = self.module.initialize(torch.from_numpy(sample.to_host())) + sample, add_time_ids, step_indexes = self.module.initialize( + torch.from_numpy(sample.to_host()) + ) sample = ireert.asdevicearray(self.dest, sample, self.dtype) add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) @@ -94,7 +96,9 @@ def initialize(self, sample): def scale_model_input(self, sample, t): scaled = ireert.asdevicearray( - self.dest, self.module.scale_model_input(torch.from_numpy(sample.to_host()), t), self.dtype + self.dest, + self.module.scale_model_input(torch.from_numpy(sample.to_host()), t), + self.dtype, ) t = [self.module.model.timesteps[t]] t = ireert.asdevicearray(self.dest, t, self.dtype) @@ -103,7 +107,11 @@ def scale_model_input(self, sample, t): def step(self, latents, t, sample): return ireert.asdevicearray( self.dest, - self.module.step(torch.from_numpy(latents.to_host()), t, torch.from_numpy(sample.to_host())).prev_sample, + self.module.step( + torch.from_numpy(latents.to_host()), + t, + torch.from_numpy(sample.to_host()), + ).prev_sample, self.dtype, ) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 5d227c3f4..7b0411ec7 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -130,7 +130,7 @@ def check_prepared( weights[submodel] = weight ready, vmfbs, weights = self.is_prepared(vmfbs, weights) if ready: - print("All necessary files found. 
Generating images.") + print("All necessary files found.") return vmfbs, weights else: print("There was an error generating the necessary files.") @@ -384,7 +384,9 @@ def load_pipeline( # self.scheduler = schedulers.SharkSchedulerCPUWrapper( # self, torch_scheduler # ) - self.scheduler = schedulers.get_scheduler(self.hf_model_name, self.scheduler_id) + self.scheduler = schedulers.get_scheduler( + self.hf_model_name, self.scheduler_id + ) else: self.scheduler = schedulers.SharkSchedulerWrapper( rt_device, vmfbs["scheduler"], weights["scheduler"] @@ -424,11 +426,10 @@ def generate_images( seed: float = -1, return_imgs: bool = False, ): - pipe_start = time.time() samples = [] numpy_images = [] - + uint32_info = np.iinfo(np.uint32) uint32_min, uint32_max = uint32_info.min, uint32_info.max if seed < uint32_min or seed >= uint32_max: @@ -482,10 +483,14 @@ def generate_images( unet_start = time.time() image = None strength = 0 - sample, timesteps = self.prepare_latents(samples[i], self.num_inference_steps, image, strength) + sample, timesteps = self.prepare_latents( + samples[i], self.num_inference_steps, image, strength + ) for i, t in tqdm(enumerate(timesteps)): - latents = self.scheduler.scale_model_input(sample, t).to(self.torch_dtype) + latents = self.scheduler.scale_model_input(sample, t).to( + self.torch_dtype + ) timestep = torch.tensor([t]).to(self.torch_dtype).detach().numpy() unet_inputs = [ latents, @@ -502,7 +507,9 @@ def generate_images( latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( *unet_inputs ) - sample = self.scheduler.step(torch.tensor(latents.to_host(), dtype=self.torch_dtype), t, sample).prev_sample + sample = self.scheduler.step( + torch.tensor(latents.to_host(), dtype=self.torch_dtype), t, sample + ).prev_sample vae_start = time.time() vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py index 384642744..cfc140c57 100644 --- a/models/turbine_models/custom_models/sd_inference/tokenization.py +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -214,8 +214,7 @@ def get_unweighted_text_embeddings( pipe.runners["clip"].config.device, text_input_chunk, "int64" ) text_embedding = ( - pipe.runners["clip"] - .ctx.modules.compiled_clip["main"](text_input_chunk) + pipe.runners["clip"].ctx.modules.compiled_clip["main"](text_input_chunk) )[0].to_host() if no_boseos_middle: if i == 0: @@ -238,8 +237,7 @@ def get_unweighted_text_embeddings( pipe.runners["clip"].config.device, text_input, "int64" ) text_embeddings = ( - pipe.runners["clip"] - .ctx.modules.compiled_clip["main"](text_input) + pipe.runners["clip"].ctx.modules.compiled_clip["main"](text_input) )[0].to_host() text_embeddings = torch.from_numpy(text_embeddings) return text_embeddings diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 462798bf4..1d3056499 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -8,6 +8,7 @@ PNDMScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler, + DPMSolverSDEScheduler, ) # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. 
@@ -20,13 +21,14 @@ "--iree-opt-data-tiling=false", "--iree-codegen-gpu-native-math-precision=true", "--iree-rocm-waves-per-eu=2", + "--iree-flow-inline-constants-max-byte-length=1", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=false", + "--iree-codegen-llvmgpu-use-vector-distribution=false", ], "clip": [ "--iree-flow-enable-aggressive-fusion", @@ -35,7 +37,7 @@ ], "vae": [ "--iree-flow-enable-aggressive-fusion", - "--iree-codegen-llvmgpu-use-vector-distribution=true" + "--iree-codegen-llvmgpu-use-vector-distribution=true", ], } GFX11_flags = { @@ -128,9 +130,7 @@ def compile_to_vmfb( debug = False if debug: flags.extend( - [ - "--iree-hal-dump-executable-files-to=" + safe_name + "_dispatches" - ] + ["--iree-hal-dump-executable-files-to=" + safe_name + "_dispatches"] ) for i, flag in enumerate(ireec_flags): @@ -273,4 +273,8 @@ def get_schedulers(model_id): model_id, subfolder="scheduler", ) + schedulers["DPMSolverSDE"] = DPMSolverSDEScheduler.from_pretrained( + model_id, + subfolder="scheduler", + ) return schedulers diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index c38d1a818..a6d030356 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -74,22 +74,22 @@ sdxl_turbo_sched_unet_bench_f16 = """ module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<1x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<1x64x2048xf16>, %arg2: tensor<1x1280xf16>, %arg3: tensor<1x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, 
{\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor) -> (tensor, tensor, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor<1xi64>) -> tensor attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<1x64x2048xf16>, %t_embeds: tensor<1x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<1x6xf16>, tensor) + func.func @produce_image_latents(%sample: tensor, %p_embeds: tensor, %t_embeds: tensor, %guidance_scale: tensor) -> tensor { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor) -> (tensor, tensor, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<1x64x2048xf16>, tensor<1x1280xf16>, tensor<1x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor, tensor, tensor, tensor, tensor, tensor<1xi64>) -> tensor + scf.yield %inner : tensor } - return %res : tensor<1x4x128x128xf16> + return %res : tensor } } """ diff --git 
a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 6cdcbb242..6dc5b0dcc 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -102,7 +102,8 @@ def __init__( self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn self.custom_vae = custom_vae - self.do_classifier_free_guidance = False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + # TODO: set this based on user-inputted guidance scale and negative prompt. + # self.do_classifier_free_guidance = False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True # FILE MANAGEMENT AND PIPELINE SETUP @@ -135,7 +136,7 @@ def check_prepared( weights[submodel] = weight ready, vmfbs, weights = self.is_prepared(vmfbs, weights) if ready: - print("All necessary files found. Generating images.") + print("All necessary files found.") return vmfbs, weights else: print("There was an error generating the necessary files.") @@ -164,7 +165,7 @@ def is_prepared(self, vmfbs, weights): for w_key in weights: if "pipeline" in w_key: continue - if weights[w_key] is not None and os.path.exists(weights[w_key]): + if weights[w_key] is not None: continue default_name = os.path.join( self.external_weights_dir, w_key + "." + self.external_weights @@ -356,7 +357,9 @@ def export_submodel( else sdxl_sched_unet_bench_f16 ) if self.do_classifier_free_guidance == False: - assert self.precision == "fp16", "turbo only supported in fp16 precision." + assert ( + self.precision == "fp16" + ), "turbo only supported in fp16 precision." pipeline_file = sdxl_turbo_sched_unet_bench_f16 pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, @@ -545,11 +548,18 @@ def generate_images( else: encode_prompts_start = time.time() - prompt_embeds, add_text_embeds = self.runners[ - "prompt_encoder" - ].ctx.modules.compiled_clip["encode_prompts"]( - *text_input_ids_list, *uncond_input_ids_list - ) + if self.do_classifier_free_guidance == False: + prompt_embeds, add_text_embeds = self.runners[ + "prompt_encoder" + ].ctx.modules.compiled_clip["encode_prompts_turbo"]( + *text_input_ids_list + ) + else: + prompt_embeds, add_text_embeds = self.runners[ + "prompt_encoder" + ].ctx.modules.compiled_clip["encode_prompts"]( + *text_input_ids_list, *uncond_input_ids_list + ) encode_prompts_end = time.time() diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index ad17ab82e..be962ac5f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -37,7 +37,7 @@ def __init__( subfolder="text_encoder_2", token=hf_auth_token, ) - self.do_classifier_free_guidance = False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + self.do_classifier_free_guidance = True def forward( self, text_input_ids_1, text_input_ids_2, uncond_input_ids_1, uncond_input_ids_2 @@ -98,6 +98,43 @@ def forward( prompt_embeds = prompt_embeds.to(self.torch_dtype) return prompt_embeds, add_text_embeds + def forward_turbo(self, text_input_ids_1, text_input_ids_2): + with torch.no_grad(): + prompt_embeds_1 = self.text_encoder_model_1( + text_input_ids_1, + output_hidden_states=True, + ) + prompt_embeds_2 = 
self.text_encoder_model_2( + text_input_ids_2, + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds_2[0] + + prompt_embeds_list = [ + prompt_embeds_1.hidden_states[-2], + prompt_embeds_2.hidden_states[-2], + ] + # neg_prompt_embeds_list = [ + # torch.zeros_like(prompt_embeds_list[0]), # dummy tensor + # torch.zeros_like(prompt_embeds_list[1]), # dummy tensor + # ] + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + bs_embed, seq_len, _ = prompt_embeds.shape + + prompt_embeds = prompt_embeds.repeat(1, 1, 1) + prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1) + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view( + bs_embed * 1, -1 + ) + add_text_embeds = pooled_prompt_embeds + + add_text_embeds = add_text_embeds.to(self.torch_dtype) + prompt_embeds = prompt_embeds.to(self.torch_dtype) + return prompt_embeds, add_text_embeds + def export_prompt_encoder( hf_model_name, @@ -189,6 +226,13 @@ def encode_prompts( t_ids_1, t_ids_2, uc_ids_1, uc_ids_2 ) + def encode_prompts_turbo( + self, + t_ids_1=AbstractTensor(1, max_length, dtype=torch.int64), + t_ids_2=AbstractTensor(1, max_length, dtype=torch.int64), + ): + return jittable(prompt_encoder_module.forward_turbo)(t_ids_1, t_ids_2) + import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledClip(context=Context(), import_to=import_to) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index e2f6f8b8a..b25487b21 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -16,6 +16,7 @@ from shark_turbine.aot import * import shark_turbine.ops as ops from turbine_models.custom_models.sd_inference import utils +from turbine_models.custom_models.sd_inference.schedulers import get_scheduler import torch import torch._dynamo as dynamo from diffusers import UNet2DConditionModel @@ -39,12 +40,12 @@ def __init__( ): super().__init__() self.do_classifier_free_guidance = True - if any(key in hf_model_name for key in ["turbo", "lightning"]): - self.do_classifier_free_guidance = False + # if any(key in hf_model_name for key in ["turbo", "lightning"]): + # self.do_classifier_free_guidance = False self.dtype = torch.float16 if precision == "fp16" else torch.float32 self.scheduler = utils.get_schedulers(hf_model_name)[scheduler_id] - if scheduler_id == "PNDM": - num_inference_steps = num_inference_steps - 1 + # if scheduler_id == "PNDM": + # num_inference_steps = num_inference_steps - 1 self.scheduler.set_timesteps(num_inference_steps) self.scheduler.is_scale_input_called = True self.return_index = return_index @@ -101,9 +102,11 @@ def forward( if self.do_classifier_free_guidance: latent_model_input = torch.cat([sample] * 2) else: - latent_model_input = sample.clone() + latent_model_input = sample - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ).type(self.dtype) noise_pred = self.unet.forward( latent_model_input, t, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py index 8945d274a..cc0c9791c 100644 --- 
a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py @@ -50,106 +50,9 @@ def run_torch_scheduled_unet( text_embeds, args, ): - from diffusers import UNet2DConditionModel - - class SDXLScheduledUnet(torch.nn.Module): - def __init__( - self, - hf_model_name, - scheduler_id, - height, - width, - batch_size, - hf_auth_token=None, - precision="fp32", - num_inference_steps=1, - return_index=False, - ): - super().__init__() - self.dtype = torch.float16 if precision == "fp16" else torch.float32 - self.scheduler = utils.get_schedulers(hf_model_name)[scheduler_id] - self.scheduler.set_timesteps(num_inference_steps) - self.scheduler.is_scale_input_called = True - self.return_index = return_index - - if precision == "fp16": - try: - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - auth_token=hf_auth_token, - low_cpu_mem_usage=False, - variant="fp16", - ) - except: - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - auth_token=hf_auth_token, - low_cpu_mem_usage=False, - ) - else: - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - auth_token=hf_auth_token, - low_cpu_mem_usage=False, - ) - - def initialize(self, sample): - height = sample.shape[-2] * 8 - width = sample.shape[-1] * 8 - original_size = (height, width) - target_size = (height, width) - crops_coords_top_left = (0, 0) - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids]) - add_time_ids = torch.cat([add_time_ids] * 2, dim=0) - add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) - timesteps = self.scheduler.timesteps - step_indexes = torch.tensor(len(timesteps)) - sample = sample * self.scheduler.init_noise_sigma - return sample.type(self.dtype), add_time_ids, step_indexes - - def forward( - self, - sample, - prompt_embeds, - text_embeds, - time_ids, - guidance_scale, - step_index, - ): - with torch.no_grad(): - added_cond_kwargs = { - "text_embeds": text_embeds, - "time_ids": time_ids, - } - t = self.scheduler.timesteps[step_index] - latent_model_input = torch.cat([sample] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t - ) - noise_pred = self.unet.forward( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=None, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - sample = self.scheduler.step(noise_pred, t, sample, return_dict=False)[ - 0 - ] - if self.return_index: - return sample.type(self.dtype), step_index - else: - return sample.type(self.dtype) + from turbine_models.custom_models.sdxl_inference.sdxl_scheduled_unet import ( + SDXLScheduledUnet, + ) unet_model = SDXLScheduledUnet( args.hf_model_name, @@ -158,9 +61,9 @@ def forward( args.width, args.batch_size, args.hf_auth_token, - args.precision, + "fp32", args.num_inference_steps, - ) + ).float() sample, add_time_ids, steps = unet_model.initialize(sample) for i in range(steps): sample = unet_model.forward( @@ -263,12 +166,20 @@ def run_torch_diffusers_loop( dtype = torch.float16 else: dtype = torch.float32 + # if "turbo" in args.hf_model_name: + # init_batch_dim = 1 + # else: + # init_batch_dim = 2 + init_batch_dim = 2 
sample = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=dtype ) timestep = torch.zeros(1, dtype=torch.int64) - prompt_embeds = torch.rand(2 * args.batch_size, args.max_length, 2048, dtype=dtype) - text_embeds = torch.rand(2 * args.batch_size, 1280, dtype=dtype) + prompt_embeds = torch.rand( + init_batch_dim * args.batch_size, args.max_length, 2048, dtype=dtype + ) + text_embeds = torch.rand(init_batch_dim * args.batch_size, 1280, dtype=dtype) + time_ids = torch.rand(init_batch_dim * args.batch_size, 6) turbine_output = run_scheduled_unet( sample, diff --git a/models/turbine_models/model_runner.py b/models/turbine_models/model_runner.py index a173f3166..1b27ca83b 100644 --- a/models/turbine_models/model_runner.py +++ b/models/turbine_models/model_runner.py @@ -72,8 +72,9 @@ def __init__(self, device, vmfb_path, external_weight_path=None, extra_plugin=No self.config.vm_instance, index.create_provider(scope="model") ) vm_modules.insert(i, param_module) + del param_module del index - del param_module + self.ctx = ireert.SystemContext( vm_modules=vm_modules, config=self.config, From 10a8ceb7740fbb72f46a8b69e146f3ee032658f7 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 28 May 2024 18:53:53 -0500 Subject: [PATCH 081/174] Fix test instantiation of scheduling model. --- models/turbine_models/tests/sd_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 7482c374f..b0829103f 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -71,6 +71,7 @@ default_arguments["height"], default_arguments["width"], default_arguments["num_inference_steps"], + default_arguments["precision"], ) From 253a14eee8a60e2517b574b23b07314c23d551a9 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 28 May 2024 19:47:59 -0500 Subject: [PATCH 082/174] fixes to sd tests --- .../custom_models/sd_inference/clip.py | 8 +-- models/turbine_models/tests/sd_test.py | 57 +++++++++++-------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index a15426d28..52c36a5c3 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -180,10 +180,8 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): if __name__ == "__main__": from .sd_cmd_opts import args - mod_str, _ = export_clip( + mod_str, _ = export_clip_model( args.hf_model_name, - args.hf_auth_token, - args.batch_size, args.max_length, args.precision, args.compile_to, @@ -195,7 +193,9 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): exit_on_vmfb=True, pipeline_dir=args.pipeline_dir, input_mlir=args.input_mlir, - attn_spec=args.attn_spec, + td_spec=args.attn_spec, + weights_only=False, + upload_ir=False, ) if args.input_mlir: exit() diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index b0829103f..7af7dcb10 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -80,15 +80,16 @@ class StableDiffusionTest(unittest.TestCase): def testExportT5Model(self): current_args = copy.deepcopy(default_arguments) current_args["hf_model_name"] = "google/t5-v1_1-small" - safe_prefix = "t5_v1_1_small" blob_name = clip.export_clip_model( hf_model_name=current_args["hf_model_name"], - hf_auth_token=None, + 
max_length=64, + precision=current_args["precision"], compile_to="vmfb", external_weights=None, external_weight_path=None, device="cpu", target_triple=None, + exit_on_vmfb=False, upload_ir=UPLOAD_IR, ) current_args["vmfb_path"] = blob_name @@ -119,12 +120,14 @@ def testExportClipVitLarge14(self): safe_prefix = "clip_vit_large_patch14" blob_name = clip.export_clip_model( hf_model_name=current_args["hf_model_name"], - hf_auth_token=None, + max_length=64, + precision=current_args["precision"], compile_to="vmfb", external_weights="safetensors", external_weight_path=safe_prefix + ".safetensors", device="cpu", target_triple=None, + exit_on_vmfb=False, upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = safe_prefix + ".safetensors" @@ -156,13 +159,15 @@ def testExportClipModel(self): current_args = copy.deepcopy(default_arguments) current_args["hf_model_name"] = "CompVis/stable-diffusion-v1-4" blob_name = clip.export_clip_model( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_clip.safetensors", - "cpu", + hf_model_name=current_args["hf_model_name"], + max_length=64, + precision=current_args["precision"], + compile_to="vmfb", + external_weights="safetensors", + external_weight_path=safe_prefix + ".safetensors", + device="cpu", + target_triple=None, + exit_on_vmfb=False, upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" @@ -194,7 +199,7 @@ def testExportUnetModel(self): current_args = copy.deepcopy(default_arguments) blob_name = unet.export_unet_model( unet_model, - "CompVis/stable-diffusion-v1-4", + current_args["hf_model_name"], current_args["batch_size"], current_args["height"], current_args["width"], @@ -203,12 +208,12 @@ def testExportUnetModel(self): None, "vmfb", "safetensors", - "stable_diffusion_v1_4_unet.safetensors", + "stable_diffusion_unet.safetensors", "cpu", upload_ir=UPLOAD_IR, ) - current_args["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" - current_args["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb" + current_args["external_weight_path"] = "stable_diffusion_unet.safetensors" + current_args["vmfb_path"] = blob_name sample = torch.rand( current_args["batch_size"], current_args["in_channels"], @@ -219,9 +224,13 @@ def testExportUnetModel(self): timestep = torch.zeros(1, dtype=torch.float32) if current_args["hf_model_name"] == "CompVis/stable-diffusion-v1-4": - encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) + encoder_hidden_states = torch.rand( + 2, current_args["max_length"], 768, dtype=torch.float32 + ) elif current_args["hf_model_name"] == "stabilityai/stable-diffusion-2-1-base": - encoder_hidden_states = torch.rand(2, 77, 1024, dtype=torch.float32) + encoder_hidden_states = torch.rand( + 2, current_args["max_length"], 1024, dtype=torch.float32 + ) guidance_scale = torch.tensor( [current_args["guidance_scale"]], dtype=torch.float32 ) @@ -251,8 +260,8 @@ def testExportUnetModel(self): new_blob_name = blob_name.split(".") new_blob_name = new_blob_name[0] + "-pass.mlir" turbine_tank.changeBlobName(blob_name, new_blob_name) - os.remove("stable_diffusion_v1_4_unet.safetensors") - os.remove("stable_diffusion_v1_4_unet.vmfb") + os.remove("stable_diffusion_unet.safetensors") + os.remove(blob_name) del torch_output del turbine @@ -260,12 +269,11 @@ def testExportVaeModelDecode(self): current_args = copy.deepcopy(default_arguments) blob_name = vae.export_vae_model( vae_model, - # This is a 
public model, so no auth required - "CompVis/stable-diffusion-v1-4", + current_args["hf_model_name"], current_args["batch_size"], current_args["height"], current_args["width"], - None, + current_args["precision"], "vmfb", "safetensors", "stable_diffusion_v1_4_vae.safetensors", @@ -303,14 +311,13 @@ def testExportVaeModelDecode(self): del torch_output del turbine os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("stable_diffusion_v1_4_vae.vmfb") + os.remove("blob_name") def testExportVaeModelEncode(self): current_args = copy.deepcopy(default_arguments) blob_name = vae.export_vae_model( vae_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", + current_args["hf_model_name"], current_args["batch_size"], current_args["height"], current_args["width"], @@ -350,7 +357,7 @@ def testExportVaeModelEncode(self): new_blob_name = new_blob_name[0] + "-pass.mlir" turbine_tank.changeBlobName(blob_name, new_blob_name) os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("stable_diffusion_v1_4_vae.vmfb") + os.remove(blob_name) @unittest.expectedFailure def testExportPNDMScheduler(self): From 3986a1eb21c2ae60b0ffbde5a04bbc6b409c8f8a Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 28 May 2024 20:28:12 -0500 Subject: [PATCH 083/174] fixup commented attr --- .../custom_models/sdxl_inference/sdxl_compiled_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 6dc5b0dcc..376fae77f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -103,7 +103,7 @@ def __init__( self.vae_decomp_attn = vae_decomp_attn self.custom_vae = custom_vae # TODO: set this based on user-inputted guidance scale and negative prompt. 
- # self.do_classifier_free_guidance = False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + self.do_classifier_free_guidance = True #False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True # FILE MANAGEMENT AND PIPELINE SETUP From 50c6c0d97baa56e6ad2cd07558052369af5eea3d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 01:19:17 -0500 Subject: [PATCH 084/174] Add DpmSolverSDE scheduler and fix formatting --- .../custom_models/sd_inference/schedulers.py | 8 ++++++-- .../sdxl_inference/sdxl_compiled_pipeline.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 5cc37c8b5..f0ad8a848 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -18,6 +18,7 @@ LMSDiscreteScheduler, PNDMScheduler, DDPMScheduler, + DPMSolverSDEScheduler, DDIMScheduler, DPMSolverMultistepScheduler, KDPM2DiscreteScheduler, @@ -243,15 +244,18 @@ def get_scheduler(model_id, scheduler_id): ] = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", - use_karras_sigmas=True, ) + schedulers["DPMSolverMultistepKarras"].config.use_karras_sigmas = True schedulers[ "DPMSolverMultistepKarras++" ] = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", algorithm_type="dpmsolver++", - use_karras_sigmas=True, + ) + schedulers["DPMSolverMultistepKarras++"].config.use_karras_sigmas = True + schedulers["DPMSolverSDE"] = DPMSolverSDEScheduler.from_pretrained( + model_id, subfolder="scheduler" ) return schedulers[scheduler_id] diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 376fae77f..6b3c7c4f3 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -103,7 +103,7 @@ def __init__( self.vae_decomp_attn = vae_decomp_attn self.custom_vae = custom_vae # TODO: set this based on user-inputted guidance scale and negative prompt. - self.do_classifier_free_guidance = True #False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True # FILE MANAGEMENT AND PIPELINE SETUP From 4424d99733fce42b5c7af479b2d06e7dc7be6860 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 13:58:15 -0500 Subject: [PATCH 085/174] Remove one more CFG conditional from unet export. 
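
Forcing do_classifier_free_guidance = True keeps the exported scheduled unet on the two-sample path, so prompt_embeds, text_embeds and time_ids all keep a leading dimension of init_batch_dim * batch_size (2 * batch_size here). For reference, a minimal sketch of the guidance combination this assumes (hypothetical standalone snippet, mirroring the chunk-and-combine pattern used by the runners elsewhere in this series; the names are illustrative):

    import torch

    def cfg_combine(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
        # noise_pred stacks [unconditional, text-conditioned] along dim 0.
        noise_uncond, noise_text = noise_pred.chunk(2)
        return noise_uncond + guidance_scale * (noise_text - noise_uncond)

With guidance_scale = 0 this collapses to the unconditional branch, which is why turbo/lightning support is handled later in the series by swapping the prompt into the negative slot rather than by disabling guidance here.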
--- .../sdxl_inference/sdxl_scheduled_unet.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index b25487b21..5f929ddda 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -149,11 +149,11 @@ def export_scheduled_unet_model( input_mlir=None, weights_only=False, ): - if "turbo" in hf_model_name: - do_classifier_free_guidance = False - else: - do_classifier_free_guidance = True - + # if "turbo" in hf_model_name: + # do_classifier_free_guidance = False + # else: + # do_classifier_free_guidance = True + do_classifier_free_guidance = True if pipeline_dir: safe_name = os.path.join( pipeline_dir, f"{scheduler_id}_unet_{str(num_inference_steps)}" @@ -213,6 +213,7 @@ def export_scheduled_unet_model( time_ids_shape = (init_batch_dim * batch_size, 6) prompt_embeds_shape = (init_batch_dim * batch_size, max_length, 2048) text_embeds_shape = (init_batch_dim * batch_size, 1280) + breakpoint() class CompiledScheduledUnet(CompiledModule): if external_weights: From e94b8dd218ba84b43bbcd693f3de25388697ae1f Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 14:02:01 -0500 Subject: [PATCH 086/174] Remove breakpoint --- .../custom_models/sdxl_inference/sdxl_scheduled_unet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 5f929ddda..576ec3e92 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -213,7 +213,6 @@ def export_scheduled_unet_model( time_ids_shape = (init_batch_dim * batch_size, 6) prompt_embeds_shape = (init_batch_dim * batch_size, max_length, 2048) text_embeds_shape = (init_batch_dim * batch_size, 1280) - breakpoint() class CompiledScheduledUnet(CompiledModule): if external_weights: From ecf824b37ce8765abc1f4720f9e764ac25d684bc Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 14:03:35 -0500 Subject: [PATCH 087/174] Get wmma spec for gfx1150 --- models/turbine_models/custom_models/sd_inference/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 1d3056499..41b6d8fbd 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -214,7 +214,7 @@ def get_mfma_spec_path(target_chip, save_dir): def get_wmma_spec_path(target_chip, save_dir): if target_chip == "gfx1100": url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1100.spec.mlir" - elif target_chip == "gfx1103": + elif target_chip in ["gfx1103", "gfx1150"]: url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1103.spec.mlir" else: return None From 45040bf7832f139f3dce68edb8369bedb1e32889 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 15:44:49 -0500 Subject: [PATCH 088/174] Make sizes dynamic for some in pipeline IR --- .../custom_models/sd_inference/utils.py | 12 ++-- .../sdxl_inference/pipeline_ir.py | 72 +++++++++---------- 2 files changed, 42 
insertions(+), 42 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 41b6d8fbd..7cdad9cd2 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -43,11 +43,11 @@ GFX11_flags = { "all": [ "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-vm-target-truncate-unsupported-floats", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-opt-data-tiling=false", - "--iree-codegen-gpu-native-math-precision=true", + #"--iree-opt-outer-dim-concat=true", + #"--iree-vm-target-truncate-unsupported-floats", + #"--iree-llvmgpu-enable-prefetch=true", + #"--iree-opt-data-tiling=false", + #"--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", ], @@ -78,7 +78,7 @@ def compile_to_vmfb( raise ValueError( "target_triple must be set. Usually this can be fixed by setting --iree_target_triple in the CLI." ) - if device == "cpu": + if device in ["cpu", "llvm-cpu"]: flags.extend( [ "--iree-llvmcpu-target-triple=" + target_triple, diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index a6d030356..1bbada725 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -1,73 +1,73 @@ sdxl_pipeline_bench_f16 = """ module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, 
\22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x?x?xf16>) -> tensor<1x3x?x?xf16> attributes 
{torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { + func.func @tokens_to_image(%sample: tensor<1x4x?x?xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x?x?xf16> { %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x?x?xf16>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x?x?xf16> + scf.yield %inner : tensor<1x4x?x?xf16> } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> - return %image : tensor<1x3x1024x1024xf16> + %image = func.call @compiled_vae.main(%res): (tensor<1x4x?x?xf16>) -> tensor<1x3x?x?xf16> + return %image : tensor<1x3x?x?xf16> } } """ sdxl_pipeline_bench_f32 = """ module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, 
\22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, 
{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x?x?xf32>) -> tensor<1x3x?x?xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @tokens_to_image(%sample: tensor<1x4x128x128xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf32> { + func.func @tokens_to_image(%sample: tensor<1x4x?x?xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x?x?xf32> { %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf32>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x?x?xf32>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> - scf.yield %inner : tensor<1x4x128x128xf32> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, 
%p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x?x?xf32> + scf.yield %inner : tensor<1x4x?x?xf32> } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf32>) -> tensor<1x3x1024x1024xf32> - return %image : tensor<1x3x1024x1024xf32> + %image = func.call @compiled_vae.main(%res): (tensor<1x4x?x?xf32>) -> tensor<1x3x?x?xf32> + return %image : tensor<1x3x?x?xf32> } } """ sdxl_sched_unet_bench_f16 = """ module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, 
\22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x128x128xf16> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + func.func @produce_image_latents(%sample: tensor<1x4x?x?xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x?x?xf16> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x?x?xf16>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> - scf.yield %inner : tensor<1x4x128x128xf16> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x?x?xf16> + scf.yield %inner : tensor<1x4x?x?xf16> } - return %res : tensor<1x4x128x128xf16> + return %res : tensor<1x4x?x?xf16> } } """ @@ -96,22 +96,22 @@ sdxl_sched_unet_bench_f32 = """ module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - 
func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @produce_image_latents(%sample: tensor<1x4x128x128xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: tensor<1xf32>) -> tensor<1x4x128x128xf32> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf32>) -> (tensor<1x4x128x128xf32>, tensor<2x6xf32>, tensor) + func.func @produce_image_latents(%sample: tensor<1x4x?x?xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: tensor<1xf32>) -> tensor<1x4x?x?xf32> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf32>) -> 
(tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x128x128xf32>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x?x?xf32>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x128x128xf32> - scf.yield %inner : tensor<1x4x128x128xf32> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x?x?xf32> + scf.yield %inner : tensor<1x4x?x?xf32> } - return %res : tensor<1x4x128x128xf32> + return %res : tensor<1x4x?x?xf32> } } """ From 5ddd898f29dda518f7aafb68fdb3ba9db208e95e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 22:36:05 -0500 Subject: [PATCH 089/174] Fixes for inlined weights --- .../custom_models/sd_inference/utils.py | 19 +++++++++++++------ .../sdxl_inference/sdxl_compiled_pipeline.py | 18 ++++++++---------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 7cdad9cd2..ab4881fca 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -43,13 +43,16 @@ GFX11_flags = { "all": [ "--iree-global-opt-propagate-transposes=true", - #"--iree-opt-outer-dim-concat=true", - #"--iree-vm-target-truncate-unsupported-floats", - #"--iree-llvmgpu-enable-prefetch=true", - #"--iree-opt-data-tiling=false", - #"--iree-codegen-gpu-native-math-precision=true", + "--iree-opt-outer-dim-concat=true", + "--iree-vm-target-truncate-unsupported-floats", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-opt-data-tiling=false", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-flow-enable-aggressive-fusion", + "--iree-global-opt-enable-fuse-horizontal-contractions=true", + "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [""], "clip": [""], @@ -107,6 +110,10 @@ def compile_to_vmfb( "--iree-hal-target-backends=rocm", "--iree-rocm-target-chip=" + target_triple, "--iree-opt-const-eval=false", + "--iree-vm-bytecode-module-output-format=flatbuffer-binary", + "--iree-stream-resource-max-allocation-size=4294967296", + "--iree-opt-strip-assertions=true", + "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", ] ) if target_triple == "gfx942": diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py 
b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 6b3c7c4f3..44ee411bb 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -167,6 +167,8 @@ def is_prepared(self, vmfbs, weights): continue if weights[w_key] is not None: continue + if self.external_weights is None: + continue default_name = os.path.join( self.external_weights_dir, w_key + "." + self.external_weights ) @@ -230,7 +232,7 @@ def export_submodel( ): if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir) - if self.external_weights_dir: + if self.external_weights and self.external_weights_dir: if not os.path.exists(self.external_weights_dir): os.makedirs(external_weights_dir, exist_ok=True) vae_external_weight_path = os.path.join( @@ -418,15 +420,11 @@ def load_pipeline( else: runners["pipe"] = vmfbRunner( rt_device, - [vmfbs["scheduled_unet"], vmfbs["pipeline"]], - [weights["scheduled_unet"], None], - ) - runners["vae_decode"] = vmfbRunner( - rt_device, vmfbs["vae_decode"], weights["vae_decode"] - ) - runners["prompt_encoder"] = vmfbRunner( - rt_device, vmfbs["prompt_encoder"], weights["prompt_encoder"] + [vmfbs["scheduled_unet"], vmfbs["pipeline"], vmfbs["vae_decode"], vmfbs["prompt_encoder"]], + [weights["scheduled_unet"], None, weights["vae_decode"], weights["prompt_encoder"]], ) + runners["vae_decode"] = runners['pipe'] + runners["prompt_encoder"] = runners['pipe'] runners["tokenizer_1"] = CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer", @@ -451,7 +449,7 @@ def generate_images( return_imgs: bool = False, ): # TODO: implement case where this is false e.g. in SDXL Turbo - # do_classifier_free_guidance = True + do_classifier_free_guidance = True iree_dtype = "float32" if self.precision == "fp32" else "float16" torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 From 2def6fb1eb53395306eff09b584110182d543816 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 29 May 2024 23:47:13 -0500 Subject: [PATCH 090/174] Fixes to inlined weights, llm compilation flexibility --- models/requirements.txt | 1 + .../custom_models/sd_inference/utils.py | 4 + .../sdxl_inference/sdxl_compiled_pipeline.py | 18 +++- .../custom_models/stateless_llama.py | 102 +++++++----------- 4 files changed, 55 insertions(+), 70 deletions(-) diff --git a/models/requirements.txt b/models/requirements.txt index 1718afb5d..6744d3238 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -5,6 +5,7 @@ transformers==4.37.1 torchsde accelerate diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release +brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading azure-storage-blob # microsoft/phi model diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index ab4881fca..2db9afd3c 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -74,6 +74,8 @@ def compile_to_vmfb( attn_spec=None, ): flags = [] + if mlir_source == "file" and not isinstance(module_str, str): + module_str = str(module_str) if target_triple in ["", None]: if device == "cpu": target_triple = "x86_64-linux-gnu" @@ -89,6 +91,8 @@ def compile_to_vmfb( "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false", 
"--iree-llvmcpu-distribution-size=32", "--iree-opt-const-eval=false", + "--iree-llvmcpu-enable-ukernels=all", + "--iree-global-opt-enable-quantized-matmul-reassociation", ] ) device = "llvm-cpu" diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 44ee411bb..e3b619b06 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -420,11 +420,21 @@ def load_pipeline( else: runners["pipe"] = vmfbRunner( rt_device, - [vmfbs["scheduled_unet"], vmfbs["pipeline"], vmfbs["vae_decode"], vmfbs["prompt_encoder"]], - [weights["scheduled_unet"], None, weights["vae_decode"], weights["prompt_encoder"]], + [ + vmfbs["scheduled_unet"], + vmfbs["pipeline"], + vmfbs["vae_decode"], + vmfbs["prompt_encoder"], + ], + [ + weights["scheduled_unet"], + None, + weights["vae_decode"], + weights["prompt_encoder"], + ], ) - runners["vae_decode"] = runners['pipe'] - runners["prompt_encoder"] = runners['pipe'] + runners["vae_decode"] = runners["pipe"] + runners["prompt_encoder"] = runners["pipe"] runners["tokenizer_1"] = CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer", diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index c3f8a9050..208728927 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -14,6 +14,7 @@ from turbine_models.custom_models.llm_optimizations.streaming_llm.modify_llama import ( enable_llama_pos_shift_attention, ) +from turbine_models.custom_models.sd_inference.utils import compile_to_vmfb from turbine_models.custom_models import remap_gguf import safetensors @@ -130,7 +131,31 @@ def export_transformer_model( mod=None, tokenizer=None, decomp_attn=False, + input_mlir=None, ): + safe_name = hf_model_name.split("/")[-1].strip() + safe_name = re.sub("-", "_", safe_name) + if not vmfb_path: + vmfb_path = safe_name + "_" + target_triple + if streaming_llm: + vmfb_path += "_streaming" + iree_flags = [] + ukernel_supported_arch = {"gfx90a", "gfx940", "gfx1030", "gfx1100"} + if target_triple in ukernel_supported_arch: + iree_flags.extend(["--iree-rocm-enable-ukernels=argmax"]) + if input_mlir is not None: + vmfb_path = compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags=iree_flags, + safe_name=vmfb_path.split(".vmfb")[0], + return_path=True, + const_expr_hoisting=True, + mlir_source="file", + save_mlir=False, + attn_spec="mfma" if "gfx9" in target_triple else "wmma", + ) if tokenizer == None: tokenizer = AutoTokenizer.from_pretrained( hf_model_name, @@ -429,8 +454,6 @@ def evict_kvcache_space(self): CompiledModule.get_mlir_module(inst).operation ).run() module_str = str(CompiledModule.get_mlir_module(inst)) - safe_name = hf_model_name.split("/")[-1].strip() - safe_name = re.sub("-", "_", safe_name) if upload_ir: with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) @@ -442,74 +465,21 @@ def evict_kvcache_space(self): if compile_to != "vmfb": return module_str, tokenizer else: - flags = [ - "--iree-input-type=torch", - "--mlir-print-debuginfo", - "--mlir-print-op-on-diagnostic=false", - "--iree-llvmcpu-target-cpu-features=host", - "--iree-llvmcpu-target-triple=x86_64-linux-gnu", - "--iree-stream-resource-index-bits=64", - "--iree-vm-target-index-bits=64", - ] - if device == 
"cpu" or device == "llvm-cpu": - flags.append("--iree-llvmcpu-enable-ukernels=all") - device = "llvm-cpu" - elif device == "vulkan": - flags.extend( - [ - "--iree-vulkan-target-triple=" + target_triple, - "--iree-stream-resource-max-allocation-size=" - + vulkan_max_allocation, - ] - ) - elif device == "rocm": - flags.extend( - [ - "--iree-rocm-target-chip=" + target_triple, - "--iree-opt-strip-assertions=true", - "--iree-vm-target-truncate-unsupported-floats", - ] - ) - ukernel_supported_arch = {"gfx90a", "gfx940", "gfx1030", "gfx1100"} - if target_triple in ukernel_supported_arch: - flags.extend( - [ - "--iree-rocm-enable-ukernels=argmax", - "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", - ] - ) - if os.path.exists("llama_argmax_td_spec.mlir"): - flags.extend( - [ - "--iree-preprocessing-transform-spec-filename=llama_argmax_td_spec.mlir", - ] - ) - elif device == "cuda": - flags.extend( - [ - "--iree-hal-cuda-llvm-target-arch=" + target_triple, - "--iree-vm-bytecode-module-strip-source-map=true", - "--iree-vm-target-truncate-unsupported-floats", - ] - ) - else: - print("Unknown device kind: ", device) - import iree.compiler as ireec - - flatbuffer_blob = ireec.compile_str( + blob_name = compile_to_vmfb( module_str, - target_backends=[device], - extra_args=flags, + device, + target_triple, + ireec_flags=iree_flags, + safe_name=vmfb_path.split(".vmfb")[0], + return_path=True, + const_expr_hoisting=True, + mlir_source="str", + save_mlir=False, + attn_spec="mfma" if "gfx9" in target_triple else "wmma", ) - if vmfb_path is None: - vmfb_path = f"{safe_name}.vmfb" - with open(vmfb_path, "wb+") as f: - f.write(flatbuffer_blob) - print("saved to ", safe_name + ".vmfb") if upload_ir: return blob_name - return module_str, tokenizer + return blob_name, tokenizer if __name__ == "__main__": From 5df2716223801e944c5eec4c55193286ac73677e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 30 May 2024 01:55:37 -0500 Subject: [PATCH 091/174] Add default triple to stateless_llama --- models/turbine_models/custom_models/stateless_llama.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 208728927..e9ca17e10 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -123,7 +123,7 @@ def export_transformer_model( quantization=None, precision=None, device=None, - target_triple=None, + target_triple="x86_64-unknown-linux-gnu", vulkan_max_allocation=None, streaming_llm=False, vmfb_path=None, @@ -133,12 +133,12 @@ def export_transformer_model( decomp_attn=False, input_mlir=None, ): - safe_name = hf_model_name.split("/")[-1].strip() - safe_name = re.sub("-", "_", safe_name) + safe_name = hf_model_name.replace("-", "_").replace("/", "_") + if streaming_llm: + safe_name += "_streaming" if not vmfb_path: vmfb_path = safe_name + "_" + target_triple - if streaming_llm: - vmfb_path += "_streaming" + iree_flags = [] ukernel_supported_arch = {"gfx90a", "gfx940", "gfx1030", "gfx1100"} if target_triple in ukernel_supported_arch: From 68c7a571aa4c6b19e3fa440fc0b7170c2d35537c Mon Sep 17 00:00:00 2001 From: ean garvey Date: Thu, 30 May 2024 13:25:17 -0400 Subject: [PATCH 092/174] Switch vecdist back on for instinct --- models/turbine_models/custom_models/sd_inference/utils.py | 7 +++---- 1 file 
changed, 3 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 2db9afd3c..7c9a5f521 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -28,7 +28,7 @@ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=false", + "--iree-codegen-llvmgpu-use-vector-distribution=true", ], "clip": [ "--iree-flow-enable-aggressive-fusion", @@ -115,9 +115,6 @@ def compile_to_vmfb( "--iree-rocm-target-chip=" + target_triple, "--iree-opt-const-eval=false", "--iree-vm-bytecode-module-output-format=flatbuffer-binary", - "--iree-stream-resource-max-allocation-size=4294967296", - "--iree-opt-strip-assertions=true", - "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", ] ) if target_triple == "gfx942": @@ -217,6 +214,8 @@ def get_mfma_spec_path(target_chip, save_dir): url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir" attn_spec = urlopen(url).read().decode("utf-8") spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir") + if os.path.exists(spec_path): + return spec_path with open(spec_path, "w") as f: f.write(attn_spec) return spec_path From 258be080161a0f307e12f10ddbb645ea8838a658 Mon Sep 17 00:00:00 2001 From: ean garvey Date: Thu, 30 May 2024 14:36:12 -0400 Subject: [PATCH 093/174] Workaround prompt/neg prompt switching for turbo mode --- .../custom_models/sdxl_inference/sdxl_compiled_pipeline.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index e3b619b06..9437f2045 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -461,6 +461,11 @@ def generate_images( # TODO: implement case where this is false e.g. in SDXL Turbo do_classifier_free_guidance = True + # Workaround for turbo support (guidance_scale 0) + if guidance_scale == 0: + negative_prompt = prompt + prompt = "" + iree_dtype = "float32" if self.precision == "fp32" else "float16" torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 From a4f2391b45b806ec212a0a4d2747ded7e0b5a21d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 30 May 2024 18:22:17 -0500 Subject: [PATCH 094/174] Inlined weights fix for sd1.5/2.1 --- .../turbine_models/custom_models/sd_inference/sd_pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 7b0411ec7..22f36ca6d 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -156,6 +156,9 @@ def is_prepared(self, vmfbs, weights): continue if weights[w_key] is not None and os.path.exists(weights[w_key]): continue + if external_weights is None: + weights[w_key] = None + continue default_name = os.path.join( self.external_weights_dir, w_key + "." 
+ self.external_weights ) From d7dabfa156f1fe96fd3d097112f89d46f3838510 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 31 May 2024 13:38:39 -0500 Subject: [PATCH 095/174] Fix inline weights again. --- models/turbine_models/custom_models/sd_inference/sd_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 22f36ca6d..3975bfbbb 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -156,7 +156,7 @@ def is_prepared(self, vmfbs, weights): continue if weights[w_key] is not None and os.path.exists(weights[w_key]): continue - if external_weights is None: + if self.external_weights is None: weights[w_key] = None continue default_name = os.path.join( From e8dcd8af0227816f5187e0e83baee52e62e15e41 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 31 May 2024 14:50:48 -0500 Subject: [PATCH 096/174] Make getter for static pipeline IRs. --- .../sdxl_inference/pipeline_ir.py | 137 ++---- .../sdxl_inference/sdxl_compiled_pipeline.py | 33 +- .../custom_models/stateless_llama.py | 460 +++++++++--------- 3 files changed, 296 insertions(+), 334 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index 1bbada725..0fa22b5c2 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -1,117 +1,74 @@ -sdxl_pipeline_bench_f16 = """ +tokens_to_image = r""" module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, 
\22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x?x?xf16>) -> tensor<1x3x?x?xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %arg1: tensor<{batch_size*2}x{max_length}x2048x{precision}>, %arg2: tensor<{batch_size*2}x1280x{precision}>, %arg3: tensor<{batch_size*2}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<{batch_size}xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, 
\22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<{batch_size}x{max_length}xi64>, %arg1: tensor<{batch_size}x{max_length}xi64>, %arg2: tensor<{batch_size}x{max_length}xi64>, %arg3: tensor<{batch_size}x{max_length}xi64>) -> (tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @tokens_to_image(%sample: tensor<1x4x?x?xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x?x?xf16> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) + func.func @tokens_to_image(%sample: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>, %t_ids_1: tensor<{batch_size}x{max_length}xi64>, %t_ids_2: tensor<{batch_size}x{max_length}xi64>, %u_ids_1: tensor<{batch_size}x{max_length}xi64>, %u_ids_2: tensor<{batch_size}x{max_length}xi64>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>) -> (tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) %c0 = arith.constant 0 : 
index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x?x?xf16>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x?x?xf16> - scf.yield %inner : tensor<1x4x?x?xf16> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> + scf.yield %inner : tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x?x?xf16>) -> tensor<1x3x?x?xf16> - return %image : tensor<1x3x?x?xf16> + %image = func.call @compiled_vae.main(%res): (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> + return %image : tensor<{batch_size}x3x{width}x{height}x{precision}> } } """ -sdxl_pipeline_bench_f32 = """ +unet_loop = r""" module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: 
null, \22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<1x4x?x?xf32>) -> tensor<1x3x?x?xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @tokens_to_image(%sample: tensor<1x4x?x?xf32>, %guidance_scale: tensor<1xf32>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x?x?xf32> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf32>, tensor<2x1280xf32>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x?x?xf32>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x?x?xf32> - scf.yield %inner : tensor<1x4x?x?xf32> - } - %image = func.call @compiled_vae.main(%res): (tensor<1x4x?x?xf32>) -> tensor<1x3x?x?xf32> - return %image : tensor<1x3x?x?xf32> - } -} -""" - -sdxl_sched_unet_bench_f16 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: 
\22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %arg1: tensor<{batch_size*2}x{max_length}x2048x{precision}>, %arg2: tensor<{batch_size*2}x1280x{precision}>, %arg3: tensor<{batch_size*2}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func @produce_image_latents(%sample: 
tensor<1x4x?x?xf16>, %p_embeds: tensor<2x64x2048xf16>, %t_embeds: tensor<2x1280xf16>, %guidance_scale: tensor<1xf16>) -> tensor<1x4x?x?xf16> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf16>) -> (tensor<1x4x?x?xf16>, tensor<2x6xf16>, tensor) + func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %p_embeds: tensor<{batch_size*2}x{max_length}x2048x{precision}>, %t_embeds: tensor<{batch_size*2}x1280x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x?x?xf16>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x?x?xf16> - scf.yield %inner : tensor<1x4x?x?xf16> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> + scf.yield %inner : tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> } - return %res : tensor<1x4x?x?xf16> + return %res : tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> } } """ -sdxl_turbo_sched_unet_bench_f16 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor) -> (tensor, tensor, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor<1xi64>) -> tensor attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, 
\22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @produce_image_latents(%sample: tensor, %p_embeds: tensor, %t_embeds: tensor, %guidance_scale: tensor) -> tensor { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor) -> (tensor, tensor, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor, tensor, tensor, tensor, tensor, tensor<1xi64>) -> tensor - scf.yield %inner : tensor - } - return %res : tensor - } -} -""" -sdxl_sched_unet_bench_f32 = """ -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x?x?xf32>, %arg1: tensor<2x64x2048xf32>, %arg2: tensor<2x1280xf32>, %arg3: tensor<2x6xf32>, %arg4: tensor<1xf32>, %arg5: tensor<1xi64>) -> tensor<1x4x?x?xf32> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - - func.func @produce_image_latents(%sample: tensor<1x4x?x?xf32>, %p_embeds: tensor<2x64x2048xf32>, %t_embeds: tensor<2x1280xf32>, %guidance_scale: 
tensor<1xf32>) -> tensor<1x4x?x?xf32> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x?x?xf32>) -> (tensor<1x4x?x?xf32>, tensor<2x6xf32>, tensor) - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %steps_int = tensor.extract %steps[] : tensor - %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg_s = %noisy_sample) -> (tensor<1x4x?x?xf32>) { - %step_64 = arith.index_cast %arg0 : index to i64 - %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg_s, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x?x?xf32>, tensor<2x64x2048xf32>, tensor<2x1280xf32>, tensor<2x6xf32>, tensor<1xf32>, tensor<1xi64>) -> tensor<1x4x?x?xf32> - scf.yield %inner : tensor<1x4x?x?xf32> - } - return %res : tensor<1x4x?x?xf32> - } -} -""" +def get_pipeline_ir( + width: int, + height: int, + precision: str, + batch_size: int, + max_length: int, + type: str, +): + precision = "f32" if precision == "fp32" else "f16" + if type == "tokens_to_image": + return tokens_to_image.format( + width=width, + height=height, + precision=precision, + batch_size=batch_size, + max_length=max_length, + ) + elif type == "unet_loop": + return unet_loop.format( + width=width, + height=height, + precision=precision, + batch_size=batch_size, + max_length=max_length, + ) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 9437f2045..bac2f2031 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -14,11 +14,7 @@ import iree.runtime as ireert from turbine_models.custom_models.sd_inference import utils from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( - sdxl_sched_unet_bench_f32, - sdxl_sched_unet_bench_f16, - sdxl_turbo_sched_unet_bench_f16, - sdxl_pipeline_bench_f32, - sdxl_pipeline_bench_f16, + get_pipeline_ir, ) from turbine_models.utils.sdxl_benchmark import run_benchmark from turbine_models.model_runner import vmfbRunner @@ -353,16 +349,14 @@ def export_submodel( ) return prompt_encoder_vmfb, prompt_encoder_external_weight_path case "pipeline": - pipeline_file = ( - sdxl_sched_unet_bench_f32 - if self.precision == "fp32" - else sdxl_sched_unet_bench_f16 + pipeline_file = get_pipeline_ir( + self.width, + self.height, + self.precision, + self.batch_size, + self.max_length, + "unet_loop", ) - if self.do_classifier_free_guidance == False: - assert ( - self.precision == "fp16" - ), "turbo only supported in fp16 precision." 
- pipeline_file = sdxl_turbo_sched_unet_bench_f16 pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, self.device, @@ -374,10 +368,13 @@ def export_submodel( ) return pipeline_vmfb, None case "full_pipeline": - pipeline_file = ( - sdxl_pipeline_bench_f32 - if self.precision == "fp32" - else sdxl_pipeline_bench_f16 + pipeline_file = get_pipeline_ir( + self.width, + self.height, + self.precision, + self.batch_size, + self.max_length, + "tokens_to_image", ) pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index e9ca17e10..170302f6b 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -208,252 +208,260 @@ def export_transformer_model( mapper = tensor_mapper.mapping initial_table = decompositions.current_aot_decompositions() - if decomp_attn == True: - with decompositions.extend_aot_decompositions(from_current=True) as init_t: - with decompositions.extend_aot_decompositions( - add_ops=[ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ): - current_table = decompositions.current_aot_decompositions() - assert len(current_table) == len(initial_table) + 1 - - class StateUpdateModule(CompiledModule): - if external_weights: - params = export_parameters( - mod, external=True, external_scope="", name_mapper=mapper.get - ) - else: - params = export_parameters(mod) - global_seq_step = export_global(AbstractIndex, mutable=True) - global_k_caches = export_global_tree( - kv_cache_structure, uninitialized=True, mutable=True - ) - global_v_caches = export_global_tree( - kv_cache_structure, uninitialized=True, mutable=True - ) + print("Decomposing torch SDPA") + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=[ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.masked_fill_.Scalar, + torch.ops.aten.copy, + ], + ): + current_table = decompositions.current_aot_decompositions() - def run_initialize(self, x=AbstractTensor(BATCH_SIZE, None, dtype=torch.int64)): - init_const = [x.dynamic_dim(1) < MAX_STEP_SEQ] - token, *state = self.initialize(x, constraints=init_const) - self.global_seq_step = IREE.tensor_dim( - state[0], 1 - ) # ? 
dimension of arbitrarily 0th kv tensor - for i in range(NUM_LAYERS): - slice_of_state = IREE.tensor_reshape( - state[i * 2], 1, self.global_seq_step, HEADS, HIDDEN_DIM + class StateUpdateModule(CompiledModule): + if external_weights: + params = export_parameters( + mod, external=True, external_scope="", name_mapper=mapper.get ) - self.global_k_caches["layer_idx"][i] = IREE.tensor_update( - self.global_k_caches["layer_idx"][i], slice_of_state, 0, 0, 0, 0 - ) - for i in range(NUM_LAYERS): - slice_of_state = IREE.tensor_reshape( - state[i * 2 + 1], 1, self.global_seq_step, HEADS, HIDDEN_DIM - ) - self.global_v_caches["layer_idx"][i] = IREE.tensor_update( - self.global_v_caches["layer_idx"][i], slice_of_state, 0, 0, 0, 0 - ) - return token - - def run_forward(self, x=AbstractTensor(1, 1, dtype=torch.int64)): - state_arg = slice_up_to_step( - self.global_k_caches, - self.global_v_caches, - self.global_seq_step, - HEADS, - HIDDEN_DIM, - NUM_LAYERS, + else: + params = export_parameters(mod) + global_seq_step = export_global(AbstractIndex, mutable=True) + global_k_caches = export_global_tree( + kv_cache_structure, uninitialized=True, mutable=True ) - forw_const = ( - [state_arg[0].dynamic_dim(1) < MAX_STEP_SEQ] - + [ - x.dynamic_dim(1) == (state_arg[0].dynamic_dim(1)) - for x in state_arg[1:] - ] - + [x.dynamic_dim(1) < MAX_STEP_SEQ for x in state_arg[1:]] + global_v_caches = export_global_tree( + kv_cache_structure, uninitialized=True, mutable=True ) - token, *state_update = self.forward(x, *state_arg, constraints=forw_const) - for i in range(NUM_LAYERS): - update = IREE.tensor_reshape( - state_update[i * 2], 1, 1, HEADS, HIDDEN_DIM - ) - self.global_k_caches["layer_idx"][i] = IREE.tensor_update( - self.global_k_caches["layer_idx"][i], - update, - 0, + + def run_initialize( + self, x=AbstractTensor(BATCH_SIZE, None, dtype=torch.int64) + ): + init_const = [x.dynamic_dim(1) < MAX_STEP_SEQ] + token, *state = self.initialize(x, constraints=init_const) + self.global_seq_step = IREE.tensor_dim( + state[0], 1 + ) # ? 
dimension of arbitrarily 0th kv tensor + for i in range(NUM_LAYERS): + slice_of_state = IREE.tensor_reshape( + state[i * 2], 1, self.global_seq_step, HEADS, HIDDEN_DIM + ) + self.global_k_caches["layer_idx"][i] = IREE.tensor_update( + self.global_k_caches["layer_idx"][i], slice_of_state, 0, 0, 0, 0 + ) + for i in range(NUM_LAYERS): + slice_of_state = IREE.tensor_reshape( + state[i * 2 + 1], 1, self.global_seq_step, HEADS, HIDDEN_DIM + ) + self.global_v_caches["layer_idx"][i] = IREE.tensor_update( + self.global_v_caches["layer_idx"][i], slice_of_state, 0, 0, 0, 0 + ) + return token + + def run_forward(self, x=AbstractTensor(1, 1, dtype=torch.int64)): + state_arg = slice_up_to_step( + self.global_k_caches, + self.global_v_caches, self.global_seq_step, - 0, - 0, + HEADS, + HIDDEN_DIM, + NUM_LAYERS, ) - for i in range(NUM_LAYERS): - update = IREE.tensor_reshape( - state_update[i * 2 + 1], 1, 1, HEADS, HIDDEN_DIM + forw_const = ( + [state_arg[0].dynamic_dim(1) < MAX_STEP_SEQ] + + [ + x.dynamic_dim(1) == (state_arg[0].dynamic_dim(1)) + for x in state_arg[1:] + ] + + [x.dynamic_dim(1) < MAX_STEP_SEQ for x in state_arg[1:]] ) - self.global_v_caches["layer_idx"][i] = IREE.tensor_update( - self.global_v_caches["layer_idx"][i], - update, - 0, - self.global_seq_step, - 0, - 0, + token, *state_update = self.forward( + x, *state_arg, constraints=forw_const ) - self.global_seq_step = self.global_seq_step + 1 - return token + for i in range(NUM_LAYERS): + update = IREE.tensor_reshape( + state_update[i * 2], 1, 1, HEADS, HIDDEN_DIM + ) + self.global_k_caches["layer_idx"][i] = IREE.tensor_update( + self.global_k_caches["layer_idx"][i], + update, + 0, + self.global_seq_step, + 0, + 0, + ) + for i in range(NUM_LAYERS): + update = IREE.tensor_reshape( + state_update[i * 2 + 1], 1, 1, HEADS, HIDDEN_DIM + ) + self.global_v_caches["layer_idx"][i] = IREE.tensor_update( + self.global_v_caches["layer_idx"][i], + update, + 0, + self.global_seq_step, + 0, + 0, + ) + self.global_seq_step = self.global_seq_step + 1 + return token - def get_seq_step(self): - return self.global_seq_step + def get_seq_step(self): + return self.global_seq_step - @jittable - def initialize(input_ids): - result = mod.forward(input_ids) - state1_flat, _ = pytree.tree_flatten(result.past_key_values) - token1 = torch.argmax(result.logits[:, -1, :], dim=1) - token1 = token1[None, :] - state1_flat = [torch.transpose(x, 1, 2) for x in state1_flat] - return token1, *state1_flat + @jittable + def initialize(input_ids): + result = mod.forward(input_ids) + state1_flat, _ = pytree.tree_flatten(result.past_key_values) + token1 = torch.argmax(result.logits[:, -1, :], dim=1) + token1 = token1[None, :] + state1_flat = [torch.transpose(x, 1, 2) for x in state1_flat] + return token1, *state1_flat - @jittable - def forward(token0: torch.Tensor, *state0_flat): - # Unpad the states. 
- state0_flat = [torch.transpose(x, 1, 2) for x in state0_flat] - state0 = pytree.tree_unflatten(state0_flat, state_schema) - result = mod.forward(token0, past_key_values=state0) - state1_flat, _ = pytree.tree_flatten(result.past_key_values) - state1_flat = [torch.transpose(x[:, :, -1:, :], 1, 2) for x in state1_flat] - token1 = torch.argmax(result.logits[:, -1, :], dim=1) - token1 = token1[None, :] - return token1, *state1_flat - - class StreamingStateUpdateModule(StateUpdateModule): - def run_cached_initialize( - self, x=AbstractTensor(BATCH_SIZE, None, dtype=torch.int64) - ): - state_arg = slice_up_to_step( - self.global_k_caches, - self.global_v_caches, - self.global_seq_step, - HEADS, - HIDDEN_DIM, - NUM_LAYERS, - ) - forw_const = ( - [x.dynamic_dim(1) < MAX_STEP_SEQ] - + [state_arg[0].dynamic_dim(1) < MAX_STEP_SEQ] - + [ - x.dynamic_dim(1) == (state_arg[0].dynamic_dim(1)) - for x in state_arg[1:] + @jittable + def forward(token0: torch.Tensor, *state0_flat): + # Unpad the states. + state0_flat = [torch.transpose(x, 1, 2) for x in state0_flat] + state0 = pytree.tree_unflatten(state0_flat, state_schema) + result = mod.forward(token0, past_key_values=state0) + state1_flat, _ = pytree.tree_flatten(result.past_key_values) + state1_flat = [ + torch.transpose(x[:, :, -1:, :], 1, 2) for x in state1_flat ] - + [x.dynamic_dim(1) < MAX_STEP_SEQ for x in state_arg[1:]] - ) - token, *state = self.cached_initialize( - x, *state_arg, constraints=forw_const - ) - len_of_new_tokens = IREE.tensor_dim( - state[0], 1 - ) # ? dimension of arbitrarily 0th kv tensor - for i in range(NUM_LAYERS): - slice_of_state = IREE.tensor_reshape( - state[i * 2], 1, len_of_new_tokens, HEADS, HIDDEN_DIM - ) - self.global_k_caches["layer_idx"][i] = IREE.tensor_update( - self.global_k_caches["layer_idx"][i], - slice_of_state, - 0, + token1 = torch.argmax(result.logits[:, -1, :], dim=1) + token1 = token1[None, :] + return token1, *state1_flat + + class StreamingStateUpdateModule(StateUpdateModule): + def run_cached_initialize( + self, x=AbstractTensor(BATCH_SIZE, None, dtype=torch.int64) + ): + state_arg = slice_up_to_step( + self.global_k_caches, + self.global_v_caches, self.global_seq_step, - 0, - 0, + HEADS, + HIDDEN_DIM, + NUM_LAYERS, ) - for i in range(NUM_LAYERS): - slice_of_state = IREE.tensor_reshape( - state[i * 2 + 1], 1, len_of_new_tokens, HEADS, HIDDEN_DIM + forw_const = ( + [x.dynamic_dim(1) < MAX_STEP_SEQ] + + [state_arg[0].dynamic_dim(1) < MAX_STEP_SEQ] + + [ + x.dynamic_dim(1) == (state_arg[0].dynamic_dim(1)) + for x in state_arg[1:] + ] + + [x.dynamic_dim(1) < MAX_STEP_SEQ for x in state_arg[1:]] ) - self.global_v_caches["layer_idx"][i] = IREE.tensor_update( - self.global_v_caches["layer_idx"][i], - slice_of_state, - 0, - self.global_seq_step, - 0, - 0, + token, *state = self.cached_initialize( + x, *state_arg, constraints=forw_const ) - self.global_seq_step = self.global_seq_step + len_of_new_tokens - return token + len_of_new_tokens = IREE.tensor_dim( + state[0], 1 + ) # ? 
dimension of arbitrarily 0th kv tensor + for i in range(NUM_LAYERS): + slice_of_state = IREE.tensor_reshape( + state[i * 2], 1, len_of_new_tokens, HEADS, HIDDEN_DIM + ) + self.global_k_caches["layer_idx"][i] = IREE.tensor_update( + self.global_k_caches["layer_idx"][i], + slice_of_state, + 0, + self.global_seq_step, + 0, + 0, + ) + for i in range(NUM_LAYERS): + slice_of_state = IREE.tensor_reshape( + state[i * 2 + 1], 1, len_of_new_tokens, HEADS, HIDDEN_DIM + ) + self.global_v_caches["layer_idx"][i] = IREE.tensor_update( + self.global_v_caches["layer_idx"][i], + slice_of_state, + 0, + self.global_seq_step, + 0, + 0, + ) + self.global_seq_step = self.global_seq_step + len_of_new_tokens + return token - @jittable - def cached_initialize(input_ids, *state0_flat): - # Unpad the states. - cur_token_len = state0_flat[0].size(1) - state0_flat = [torch.transpose(x, 1, 2) for x in state0_flat] - state0 = pytree.tree_unflatten(state0_flat, state_schema) - result = mod.forward(input_ids, past_key_values=state0) - state1_flat, _ = pytree.tree_flatten(result.past_key_values) - state1_flat = [ - torch.transpose(x[:, :, cur_token_len:, :], 1, 2) for x in state1_flat - ] - token1 = torch.argmax(result.logits[:, -1, :], dim=1) - token1 = token1[None, :] - return token1, *state1_flat + @jittable + def cached_initialize(input_ids, *state0_flat): + # Unpad the states. + cur_token_len = state0_flat[0].size(1) + state0_flat = [torch.transpose(x, 1, 2) for x in state0_flat] + state0 = pytree.tree_unflatten(state0_flat, state_schema) + result = mod.forward(input_ids, past_key_values=state0) + state1_flat, _ = pytree.tree_flatten(result.past_key_values) + state1_flat = [ + torch.transpose(x[:, :, cur_token_len:, :], 1, 2) + for x in state1_flat + ] + token1 = torch.argmax(result.logits[:, -1, :], dim=1) + token1 = token1[None, :] + return token1, *state1_flat - # Streaming-LLM KVCache evict algorithm: - # slice1 = KVCache[0 : sink] - # slice2 = KVCache[seq_len - window_size : seq_len] - # KVCache = torch.cat([slice1, slice2]) - # TODO: Add move to handle overlap of data. - def evict_kvcache_space(self): - # TODO: Replace hardcoded with global variable. - sink_size = 4 - window_size = 252 - most_recent_window = self.global_seq_step + (-window_size) - for i in range(NUM_LAYERS): - update_window_state = IREE.tensor_slice( - self.global_k_caches["layer_idx"][i], - 0, - (most_recent_window, window_size), - (0, HEADS), - (0, HIDDEN_DIM), - ) # sequence context dim - self.global_k_caches["layer_idx"][i] = IREE.tensor_update( - self.global_k_caches["layer_idx"][i], - update_window_state, - 0, - sink_size, - 0, - 0, - ) - for i in range(NUM_LAYERS): - update_window_state = IREE.tensor_slice( - self.global_v_caches["layer_idx"][i], - 0, - (most_recent_window, window_size), - (0, HEADS), - (0, HIDDEN_DIM), - ) # sequence context dim - self.global_v_caches["layer_idx"][i] = IREE.tensor_update( - self.global_v_caches["layer_idx"][i], - update_window_state, - 0, - sink_size, - 0, - 0, - ) - self.global_seq_step.set(window_size + sink_size) - return self.global_seq_step + # Streaming-LLM KVCache evict algorithm: + # slice1 = KVCache[0 : sink] + # slice2 = KVCache[seq_len - window_size : seq_len] + # KVCache = torch.cat([slice1, slice2]) + # TODO: Add move to handle overlap of data. + def evict_kvcache_space(self): + # TODO: Replace hardcoded with global variable. 
+ sink_size = 4 + window_size = 252 + most_recent_window = self.global_seq_step + (-window_size) + for i in range(NUM_LAYERS): + update_window_state = IREE.tensor_slice( + self.global_k_caches["layer_idx"][i], + 0, + (most_recent_window, window_size), + (0, HEADS), + (0, HIDDEN_DIM), + ) # sequence context dim + self.global_k_caches["layer_idx"][i] = IREE.tensor_update( + self.global_k_caches["layer_idx"][i], + update_window_state, + 0, + sink_size, + 0, + 0, + ) + for i in range(NUM_LAYERS): + update_window_state = IREE.tensor_slice( + self.global_v_caches["layer_idx"][i], + 0, + (most_recent_window, window_size), + (0, HEADS), + (0, HIDDEN_DIM), + ) # sequence context dim + self.global_v_caches["layer_idx"][i] = IREE.tensor_update( + self.global_v_caches["layer_idx"][i], + update_window_state, + 0, + sink_size, + 0, + 0, + ) + self.global_seq_step.set(window_size + sink_size) + return self.global_seq_step - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - if streaming_llm: - print("Compiling with Streaming LLM") - inst = StreamingStateUpdateModule(context=Context(), import_to=import_to) - else: - inst = StateUpdateModule(context=Context(), import_to=import_to) - # TODO: Integrate with external parameters to actually be able to run - # TODO: Make more generalizable to be able to quantize with all compile_to options - if quantization == "int4" and not compile_to == "linalg": - from shark_turbine.transforms.quantization import mm_group_quant + import_to = "INPUT" if compile_to == "linalg" else "IMPORT" + if streaming_llm: + print("Compiling with Streaming LLM") + inst = StreamingStateUpdateModule(context=Context(), import_to=import_to) + else: + inst = StateUpdateModule(context=Context(), import_to=import_to) + # TODO: Integrate with external parameters to actually be able to run + # TODO: Make more generalizable to be able to quantize with all compile_to options + if quantization == "int4" and not compile_to == "linalg": + from shark_turbine.transforms.quantization import mm_group_quant - mm_group_quant.MMGroupQuantRewriterPass( - CompiledModule.get_mlir_module(inst).operation - ).run() - module_str = str(CompiledModule.get_mlir_module(inst)) + mm_group_quant.MMGroupQuantRewriterPass( + CompiledModule.get_mlir_module(inst).operation + ).run() + module_str = str(CompiledModule.get_mlir_module(inst)) if upload_ir: with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) From 5dc4e3c6f50d5f75d4b7631a81e9606c2a677693 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 31 May 2024 14:53:09 -0500 Subject: [PATCH 097/174] Fix pipeline ir import from sdxl_scheduled_unet script --- .../sdxl_inference/sdxl_scheduled_unet.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 576ec3e92..a4cc008f0 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -267,17 +267,15 @@ def run_forward( def export_pipeline_module(args): - from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( - sdxl_sched_unet_bench_f32, - sdxl_sched_unet_bench_f16, - sdxl_pipeline_bench_f32, - sdxl_pipeline_bench_f16, - ) + from turbine_models.custom_models.sdxl_inference.pipeline_ir import get_pipeline_ir - pipeline_file = ( - sdxl_sched_unet_bench_f32 - if args.precision == "fp32" - else 
sdxl_sched_unet_bench_f16 + pipeline_file = get_pipeline_ir( + args.width, + args.height, + args.precision, + args.batch_size, + args.max_length, + "unet_loop", ) pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, @@ -288,8 +286,13 @@ def export_pipeline_module(args): return_path=True, mlir_source="str", ) - full_pipeline_file = ( - sdxl_pipeline_bench_f32 if args.precision == "fp32" else sdxl_pipeline_bench_f16 + full_pipeline_file = get_pipeline_ir( + args.width, + args.height, + args.precision, + args.batch_size, + args.max_length, + "tokens_to_image", ) full_pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, From c75ebe4992558b5ec3356f8ffe02bf7c4d5fdc50 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 31 May 2024 16:43:47 -0500 Subject: [PATCH 098/174] fix format() issue with pipeline IRs --- .../sdxl_inference/pipeline_ir.py | 58 +++++++++++-------- .../sdxl_inference/sdxl_compiled_pipeline.py | 2 +- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index 0fa22b5c2..cdb04b63d 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -1,49 +1,49 @@ tokens_to_image = r""" -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %arg1: tensor<{batch_size*2}x{max_length}x2048x{precision}>, %arg2: tensor<{batch_size*2}x1280x{precision}>, %arg3: tensor<{batch_size*2}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<{batch_size}xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} - func.func private @compiled_clip.encode_prompts(%arg0: 
tensor<{batch_size}x{max_length}xi64>, %arg1: tensor<{batch_size}x{max_length}xi64>, %arg2: tensor<{batch_size}x{max_length}xi64>, %arg3: tensor<{batch_size}x{max_length}xi64>) -> (tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_vae.main(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} +module @sdxl_compiled_pipeline {{ + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<{batch_size}xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, 
\22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<{batch_size}x{max_length}xi64>, %arg1: tensor<{batch_size}x{max_length}xi64>, %arg2: tensor<{batch_size}x{max_length}xi64>, %arg3: tensor<{batch_size}x{max_length}xi64>) -> (tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} + func.func private @{vae_fn_name}.main(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} - func.func @tokens_to_image(%sample: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>, %t_ids_1: tensor<{batch_size}x{max_length}xi64>, %t_ids_2: tensor<{batch_size}x{max_length}xi64>, %u_ids_1: tensor<{batch_size}x{max_length}xi64>, %u_ids_2: tensor<{batch_size}x{max_length}xi64>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> { - %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>) -> (tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>) - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) + func.func @tokens_to_image(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>, %t_ids_1: tensor<{batch_size}x{max_length}xi64>, %t_ids_2: tensor<{batch_size}x{max_length}xi64>, %u_ids_1: tensor<{batch_size}x{max_length}xi64>, %u_ids_2: tensor<{batch_size}x{max_length}xi64>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> {{ + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : 
(tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>) -> (tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) {{ %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> - scf.yield %inner : tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> - } - %image = func.call @compiled_vae.main(%res): (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> + scf.yield %inner : tensor<{batch_size}x4x{lw}x{lh}x{precision}> + }} + %image = func.call @{vae_fn_name}.main(%res): (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> return %image : tensor<{batch_size}x3x{width}x{height}x{precision}> - } -} + }} +}} """ unet_loop = r""" -module @sdxl_compiled_pipeline { - func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %arg1: tensor<{batch_size*2}x{max_length}x2048x{precision}>, %arg2: tensor<{batch_size*2}x1280x{precision}>, 
%arg3: tensor<{batch_size*2}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} +module @sdxl_compiled_pipeline {{ + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} - func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, %p_embeds: tensor<{batch_size*2}x{max_length}x2048x{precision}>, %t_embeds: tensor<{batch_size*2}x1280x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> { - %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) -> 
(tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor) + func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %p_embeds: tensor<{bd}x{max_length}x2048x{precision}>, %t_embeds: tensor<{bd}x1280x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> { + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) { %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{width/8}x{height/8}x{precision}>, tensor<{batch_size*2}x{max_length}x2048x{precision}>, tensor<{batch_size*2}x1280x{precision}>, tensor<{batch_size*2}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> - scf.yield %inner : tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> + scf.yield %inner : tensor<{batch_size}x4x{lw}x{lh}x{precision}> } - return %res : tensor<{batch_size}x4x{width/8}x{height/8}x{precision}> + return %res : tensor<{batch_size}x4x{lw}x{lh}x{precision}> } -} +}} """ @@ -54,20 +54,28 @@ def get_pipeline_ir( batch_size: int, max_length: int, type: str, + vae_fn_name: str = "compiled_vae", ): precision = "f32" if precision == "fp32" else "f16" if type == "tokens_to_image": return tokens_to_image.format( width=width, height=height, + lw=int(width / 8), + lh=int(height / 8), + bd=int(batch_size * 2), precision=precision, batch_size=batch_size, max_length=max_length, + vae_fn_name=vae_fn_name, ) elif type == "unet_loop": return unet_loop.format( width=width, height=height, + lw=int(width / 8), + lh=int(height / 8), + bd=int(batch_size * 2), precision=precision, batch_size=batch_size, max_length=max_length, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index bac2f2031..e6612eea2 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -215,7 +215,7 @@ def get_torch_models(self, submodel): custom_vae=( "madebyollin/sdxl-vae-fp16-fix" if self.precision == "fp16" - else None + else self.custom_vae ), ) return vae_torch From a6796b2e553c9f501bcffd3594e1dc6e2688a9b2 Mon Sep 17 00:00:00 2001 From: Ean 
Garvey Date: Fri, 31 May 2024 18:46:11 -0500 Subject: [PATCH 099/174] Turn on const eval for triples for which we inline weights. --- models/turbine_models/custom_models/sd_inference/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 7c9a5f521..bc92c5576 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -113,7 +113,6 @@ def compile_to_vmfb( [ "--iree-hal-target-backends=rocm", "--iree-rocm-target-chip=" + target_triple, - "--iree-opt-const-eval=false", "--iree-vm-bytecode-module-output-format=flatbuffer-binary", ] ) @@ -162,6 +161,9 @@ def compile_to_vmfb( if target_triple in ["gfx1100", "gfx1103", "gfx1150"]: flags.extend(GFX11_flags["all"]) + if target_triple not in ["gfx1103", "gfx1150"]: + flags.extend(["--iree-opt-const-eval=false"]) + # Currently, we need a transform dialect script to be applied to the compilation through IREE in certain cases. # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. # This is a temporary solution, and should be removed or largely disabled once the functionality of From 88b8df804bd38bb10b3d90cfbc0201dbca2b4d42 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 31 May 2024 19:04:04 -0500 Subject: [PATCH 100/174] More fixes to pipeline IR --- .../custom_models/sdxl_inference/pipeline_ir.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index cdb04b63d..1bffadfa5 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -29,20 +29,20 @@ func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: 
null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} - func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %p_embeds: tensor<{bd}x{max_length}x2048x{precision}>, %t_embeds: tensor<{bd}x1280x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> { + func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %p_embeds: tensor<{bd}x{max_length}x2048x{precision}>, %t_embeds: tensor<{bd}x1280x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> {{ %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %steps_int = tensor.extract %steps[] : tensor %n_steps = arith.index_cast %steps_int: i64 to index - %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) { + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) {{ %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> scf.yield %inner : tensor<{batch_size}x4x{lw}x{lh}x{precision}> - } + }} return %res : tensor<{batch_size}x4x{lw}x{lh}x{precision}> - } + }} }} """ From 75cd61c878a04995e4b60829a7e39e05291b4cc6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sun, 2 Jun 2024 00:04:48 -0500 Subject: [PATCH 101/174] Add flag to disable transform dialect jit for llvmgpu --- models/turbine_models/custom_models/sd_inference/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index bc92c5576..cfe40e0f4 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -52,6 +52,7 @@ "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", + '--iree-codegen-llvmgpu-enable-transform-dialect-jit=false', "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [""], From f246282c60b79178c6c2ac5e58fee60952240229 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sun, 2 Jun 2024 10:04:25 -0500 Subject: [PATCH 102/174] Add znver4 compile options. 
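This adds a dedicated flag set for Zen 4 CPUs: when the llvm-cpu backend is
selected with target_triple="znver4", compile_to_vmfb() uses znver4_flags["all"]
(bf16 contraction-input demotion, mmt4d/pack/unpack ukernels, collapsed
reduction dims) instead of the generic CPU flags, and a new winograd argument
appends znver4_flags["winograd"] on top. A minimal usage sketch, assuming the
keyword arguments visible in this patch; the .mlir path and output name are
placeholders, and any remaining required arguments of the full signature are
omitted here:

    from turbine_models.custom_models.sd_inference import utils

    # Placeholder module; in practice this is an exported SD submodel.
    with open("stable_diffusion_xl_base_1_0_vae.mlir", "r") as f:
        module_str = f.read()

    vmfb_path = utils.compile_to_vmfb(
        module_str,
        device="cpu",            # routed to the llvm-cpu backend
        target_triple="znver4",  # selects znver4_flags["all"] over the generic CPU flags
        safe_name="sdxl_vae_znver4",  # placeholder output name
        return_path=True,
        mlir_source="str",
        winograd=True,           # additionally appends znver4_flags["winograd"]
    )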
--- .../custom_models/sd_inference/utils.py | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index cfe40e0f4..e0ba69fea 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -39,6 +39,7 @@ "--iree-flow-enable-aggressive-fusion", "--iree-codegen-llvmgpu-use-vector-distribution=true", ], + "winograd": [""], } GFX11_flags = { "all": [ @@ -58,8 +59,21 @@ "unet": [""], "clip": [""], "vae": [""], + "winograd": [""], +} +znver4_flags = { + "all": [ + "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-global-opt-demote-contraction-inputs-to-bf16))", + "--iree-llvmcpu-target-cpu=znver4", + "--iree-llvmcpu-enable-ukernels=mmt4d,pack,unpack", + "--iree-flow-collapse-reduction-dims", + "--iree-opt-const-expr-max-size-increase-threshold=1000000000000000", + ], + "winograd": [ + "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", + "--iree-flow-enable-fuse-padding-into-linalg-consumer-ops", + ], } - def compile_to_vmfb( module_str, @@ -73,6 +87,7 @@ def compile_to_vmfb( max_alloc="4294967296", save_mlir=True, attn_spec=None, + winograd=False, ): flags = [] if mlir_source == "file" and not isinstance(module_str, str): @@ -85,17 +100,22 @@ def compile_to_vmfb( "target_triple must be set. Usually this can be fixed by setting --iree_target_triple in the CLI." ) if device in ["cpu", "llvm-cpu"]: - flags.extend( - [ - "--iree-llvmcpu-target-triple=" + target_triple, - "--iree-llvmcpu-target-cpu-features=host", - "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false", - "--iree-llvmcpu-distribution-size=32", - "--iree-opt-const-eval=false", - "--iree-llvmcpu-enable-ukernels=all", - "--iree-global-opt-enable-quantized-matmul-reassociation", - ] - ) + if target_triple == "znver4": + flags.extend(znver4_flags["all"]) + if winograd: + flags.extend(znver4_flags["winograd"]) + else: + flags.extend( + [ + "--iree-llvmcpu-target-triple=" + target_triple, + "--iree-llvmcpu-target-cpu-features=host", + "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false", + "--iree-llvmcpu-distribution-size=32", + "--iree-opt-const-eval=false", + "--iree-llvmcpu-enable-ukernels=all", + "--iree-global-opt-enable-quantized-matmul-reassociation", + ] + ) device = "llvm-cpu" elif device in ["vulkan", "vulkan-spirv"]: flags.extend( @@ -159,9 +179,12 @@ def compile_to_vmfb( flags.extend(MI_flags["vae"]) flags.extend(MI_flags["all"]) - if target_triple in ["gfx1100", "gfx1103", "gfx1150"]: + if "gfx11" in target_triple: flags.extend(GFX11_flags["all"]) + # for now, these devices don't play well with external weights, so we assume + # that the model has inlined params and benefits from const-eval. + # otherwise, disable it since we should have external weights. 
if target_triple not in ["gfx1103", "gfx1150"]: flags.extend(["--iree-opt-const-eval=false"]) @@ -172,7 +195,7 @@ def compile_to_vmfb( if attn_spec in ["default", "mfma"]: attn_spec = get_mfma_spec_path(target_triple, os.path.dirname(safe_name)) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) - elif attn_spec in ["wmma"]: + elif attn_spec in ["wmma"] or "gfx11" in target_triple: attn_spec = get_wmma_spec_path(target_triple, os.path.dirname(safe_name)) if attn_spec: flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) From 76c9f2398c1418a923f1608590f60312eee9ecc6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sun, 2 Jun 2024 11:27:55 -0500 Subject: [PATCH 103/174] Disable consteval. --- .../turbine_models/custom_models/sd_inference/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index e0ba69fea..382a80fd9 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -15,6 +15,7 @@ MI_flags = { "all": [ "--iree-global-opt-propagate-transposes=true", + "--iree-opt-const-eval=false", "--iree-opt-outer-dim-concat=true", "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", @@ -48,6 +49,7 @@ "--iree-vm-target-truncate-unsupported-floats", "--iree-llvmgpu-enable-prefetch=true", "--iree-opt-data-tiling=false", + "--iree-opt-const-eval=false", "--iree-opt-aggressively-propagate-transposes=true", "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", @@ -65,6 +67,7 @@ "all": [ "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-global-opt-demote-contraction-inputs-to-bf16))", "--iree-llvmcpu-target-cpu=znver4", + "--iree-opt-const-eval=false", "--iree-llvmcpu-enable-ukernels=mmt4d,pack,unpack", "--iree-flow-collapse-reduction-dims", "--iree-opt-const-expr-max-size-increase-threshold=1000000000000000", @@ -182,12 +185,6 @@ def compile_to_vmfb( if "gfx11" in target_triple: flags.extend(GFX11_flags["all"]) - # for now, these devices don't play well with external weights, so we assume - # that the model has inlined params and benefits from const-eval. - # otherwise, disable it since we should have external weights. - if target_triple not in ["gfx1103", "gfx1150"]: - flags.extend(["--iree-opt-const-eval=false"]) - # Currently, we need a transform dialect script to be applied to the compilation through IREE in certain cases. # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. # This is a temporary solution, and should be removed or largely disabled once the functionality of From 5bb12ba95f8723027e1d840873505bae5aec0ec7 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 4 Jun 2024 19:45:20 -0500 Subject: [PATCH 104/174] QOL fixes to unet scripts. 
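Besides only writing the .mlir dump when not compiling straight to a vmfb, the
runner now uses a nonzero timestep and random time_ids, and saves everything it
fed the model (input1.npy .. input6.npy) plus the torch reference
(golden_out.npy), so a bad run can be re-checked offline. A small sketch of
replaying those dumps with the same tolerances the runner uses; the file
"turbine_out.npy" is a hypothetical dump of the runtime result and is not
written by this patch:

    import numpy as np

    # sample, timestep, prompt_embeds, text_embeds, time_ids, guidance_scale
    inputs = [np.load(f"input{i}.npy") for i in range(1, 7)]
    golden = np.load("golden_out.npy")      # torch reference saved by the runner
    turbine = np.load("turbine_out.npy")    # assumed: saved separately by the caller

    print("sample:", inputs[0].shape, "timestep:", inputs[1])
    np.testing.assert_allclose(turbine, golden, atol=4e-2, rtol=4e-1)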
--- .../custom_models/sdxl_inference/unet.py | 7 +++-- .../sdxl_inference/unet_runner.py | 31 ++++++++++++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index ef3db6212..e756c8967 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -239,6 +239,7 @@ def main( args.hf_model_name, f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet", ) - with open(f"{safe_name}.mlir", "w+") as f: - f.write(mod_str) - print("Saved to", safe_name + ".mlir") + if args.compile_to != "vmfb": + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index 60cc206f1..0dec00a61 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -112,15 +112,30 @@ def run_torch_unet( dtype = torch.float16 else: dtype = torch.float32 + + save_inputs = True + sample = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=dtype ) - timestep = torch.zeros(1, dtype=torch.int64) + timestep = torch.ones(1, dtype=torch.int64) prompt_embeds = torch.rand(2 * args.batch_size, args.max_length, 2048, dtype=dtype) text_embeds = torch.rand(2 * args.batch_size, 1280, dtype=dtype) - time_ids = torch.zeros(2 * args.batch_size, 6, dtype=dtype) + time_ids = torch.rand(2 * args.batch_size, 6, dtype=dtype) guidance_scale = torch.tensor([7.5], dtype=dtype) + if save_inputs: + import os + inputs_dir = "sdxl_unet_inputs_" + args.precision + if not os.path.exists(inputs_dir): + os.mkdir(inputs_dir) + np.save("input1.npy", sample) + np.save("input2.npy", timestep) + np.save("input3.npy", prompt_embeds) + np.save("input4.npy", text_embeds) + np.save("input5.npy", time_ids) + np.save("input6.npy", guidance_scale) + turbine_output = run_unet( args.device, sample, @@ -133,12 +148,12 @@ def run_torch_unet( args.hf_model_name, args.hf_auth_token, args.external_weight_path, - ) + ).to_host() print( "TURBINE OUTPUT:", - turbine_output.to_host(), - turbine_output.to_host().shape, - turbine_output.to_host().dtype, + turbine_output, + turbine_output.shape, + turbine_output.dtype, ) if args.compare_vs_torch: @@ -158,9 +173,9 @@ def run_torch_unet( # precision="fp16", ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) + if save_inputs: + np.save("golden_out.npy", torch_output) atol = 4e-2 rtol = 4e-1 np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol) - # TODO: Figure out why we occasionally segfault without unlinking output variables - turbine_output = None From 1993b0c83091aefa3aa8c87331c760a5d69ce93a Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 4 Jun 2024 17:12:54 -0500 Subject: [PATCH 105/174] Adds batch size to prompt encoder and fixes - Adds batch size support for prompt encoder output to enable batched text to image support - Fixes for pipeline_ir and sdxl_command_pipeline for batch size support - Testing 1,2,4,8,12,16 batch sizes. 
8 and 16 currently have compile time issues through iree compile, otherwise all works --- .../custom_models/sd_inference/utils.py | 3 +- .../sdxl_inference/pipeline_ir.py | 12 +++---- .../sdxl_inference/sdxl_compiled_pipeline.py | 31 +++++++++++++------ .../sdxl_inference/sdxl_prompt_encoder.py | 20 ++++++++++-- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 382a80fd9..bc986ae4d 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -55,7 +55,7 @@ "--iree-global-opt-enable-fuse-horizontal-contractions=true", "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", - '--iree-codegen-llvmgpu-enable-transform-dialect-jit=false', + "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", ], "unet": [""], @@ -78,6 +78,7 @@ ], } + def compile_to_vmfb( module_str, device, diff --git a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py index 1bffadfa5..cb2b62bea 100644 --- a/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py +++ b/models/turbine_models/custom_models/sdxl_inference/pipeline_ir.py @@ -1,11 +1,11 @@ tokens_to_image = r""" module @sdxl_compiled_pipeline {{ func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<{batch_size}xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: 
\22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<1x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} func.func private @compiled_clip.encode_prompts(%arg0: tensor<{batch_size}x{max_length}xi64>, %arg1: tensor<{batch_size}x{max_length}xi64>, %arg2: tensor<{batch_size}x{max_length}xi64>, %arg3: tensor<{batch_size}x{max_length}xi64>) -> (tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} func.func private @{vae_fn_name}.main(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} - func.func @tokens_to_image(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>, %t_ids_1: tensor<{batch_size}x{max_length}xi64>, %t_ids_2: tensor<{batch_size}x{max_length}xi64>, %u_ids_1: tensor<{batch_size}x{max_length}xi64>, %u_ids_2: tensor<{batch_size}x{max_length}xi64>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> {{ + func.func @tokens_to_image(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %guidance_scale: tensor<1x{precision}>, 
%t_ids_1: tensor<{batch_size}x{max_length}xi64>, %t_ids_2: tensor<{batch_size}x{max_length}xi64>, %u_ids_1: tensor<{batch_size}x{max_length}xi64>, %u_ids_2: tensor<{batch_size}x{max_length}xi64>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> {{ %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>, tensor<{batch_size}x{max_length}xi64>) -> (tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>) %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) %c0 = arith.constant 0 : index @@ -15,7 +15,7 @@ %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) {{ %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, tensor<1x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> scf.yield %inner : tensor<{batch_size}x4x{lw}x{lh}x{precision}> }} %image = func.call @{vae_fn_name}.main(%res): (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> tensor<{batch_size}x3x{width}x{height}x{precision}> @@ -27,9 +27,9 @@ unet_loop = r""" module @sdxl_compiled_pipeline {{ func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}]"}} - func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<{batch_size}x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: 
\22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %arg1: tensor<{bd}x{max_length}x2048x{precision}>, %arg2: tensor<{bd}x1280x{precision}>, %arg3: tensor<{bd}x6x{precision}>, %arg4: tensor<1x{precision}>, %arg5: tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> attributes {{torch.args_schema = "[1, {{\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]}}, {{\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}}]}}]", torch.return_schema = "[1, {{\22type\22: null, \22context\22: null, \22children_spec\22: []}}]"}} - func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %p_embeds: tensor<{bd}x{max_length}x2048x{precision}>, %t_embeds: tensor<{bd}x1280x{precision}>, %guidance_scale: tensor<{batch_size}x{precision}>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> {{ + func.func @produce_image_latents(%sample: tensor<{batch_size}x4x{lw}x{lh}x{precision}>, %p_embeds: tensor<{bd}x{max_length}x2048x{precision}>, %t_embeds: tensor<{bd}x1280x{precision}>, %guidance_scale: tensor<1x{precision}>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> {{ %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x6x{precision}>, tensor) %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -38,7 +38,7 @@ %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<{batch_size}x4x{lw}x{lh}x{precision}>) {{ %step_64 = arith.index_cast %arg0 : index to i64 %this_step = tensor.from_elements %step_64 : tensor<1xi64> - %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, tensor<{batch_size}x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> + %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<{batch_size}x4x{lw}x{lh}x{precision}>, tensor<{bd}x{max_length}x2048x{precision}>, tensor<{bd}x1280x{precision}>, tensor<{bd}x6x{precision}>, 
tensor<1x{precision}>, tensor<1xi64>) -> tensor<{batch_size}x4x{lw}x{lh}x{precision}> scf.yield %inner : tensor<{batch_size}x4x{lw}x{lh}x{precision}> }} return %res : tensor<{batch_size}x4x{lw}x{lh}x{precision}> diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index e6612eea2..23af25aa4 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -346,6 +346,7 @@ def export_submodel( input_mlir=input_mlir["prompt_encoder"], attn_spec=self.attn_spec, weights_only=weights_only, + output_batchsize=self.batch_size, ) return prompt_encoder_vmfb, prompt_encoder_external_weight_path case "pipeline": @@ -624,13 +625,22 @@ def generate_images( for idx, image in enumerate(numpy_images): image = torch.from_numpy(image).cpu().permute(0, 2, 3, 1).float().numpy() image = numpy_to_pil_image(image) - images.append(image[0]) + images.append(image) if return_imgs: return images - for idx, image in enumerate(images): - img_path = "sdxl_output_" + timestamp + "_" + str(idx) + ".png" - image.save(img_path) - print(img_path, "saved") + for idx_batch, image_batch in enumerate(images): + for idx, image in enumerate(image_batch): + img_path = ( + "sdxl_output_" + + timestamp + + "_" + + str(idx_batch) + + "_" + + str(idx) + + ".png" + ) + image.save(img_path) + print(img_path, "saved") return @@ -643,10 +653,14 @@ def numpy_to_pil_image(images): images = (images * 255).round().astype("uint8") if images.shape[-1] == 1: # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + pil_images = [] + for batched_image in images: + for image in range(0, batched_image.size(dim=0)): + pil_images.append(Image.fromarray(image.squeeze(), mode="L")) else: - pil_images = [Image.fromarray(image) for image in images] - + pil_images = [] + for image in images: + pil_images.append(Image.fromarray(image)) return pil_images @@ -702,7 +716,6 @@ def numpy_to_pil_image(images): mlirs[submodel_id] = mlir_path if not args.external_weights_dir and args.external_weights: args.external_weights_dir = args.pipeline_dir - sdxl_pipe = SharkSDXLPipeline( args.hf_model_name, args.scheduler_id, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index be962ac5f..3df5607fc 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -24,6 +24,7 @@ def __init__( precision, hf_auth_token=None, do_classifier_free_guidance=True, + batch_size=1, ): super().__init__() self.torch_dtype = torch.float16 if precision == "fp16" else torch.float32 @@ -38,6 +39,7 @@ def __init__( token=hf_auth_token, ) self.do_classifier_free_guidance = True + self.batch_size = batch_size def forward( self, text_input_ids_1, text_input_ids_2, uncond_input_ids_1, uncond_input_ids_2 @@ -76,24 +78,28 @@ def forward( neg_prompt_embeds = torch.concat(neg_prompt_embeds_list, dim=-1) bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.repeat(1, 1, 1) prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1) pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view( bs_embed * 1, -1 ) + prompt_embeds = 
prompt_embeds.repeat(self.batch_size, 1, 1) add_text_embeds = pooled_prompt_embeds + add_text_embeds = add_text_embeds.repeat(self.batch_size, 1) if self.do_classifier_free_guidance: neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat(1, 1).view( 1, -1 ) neg_prompt_embeds = neg_prompt_embeds.repeat(1, 1, 1) neg_prompt_embeds = neg_prompt_embeds.view(bs_embed * 1, seq_len, -1) + neg_prompt_embeds = neg_prompt_embeds.repeat(self.batch_size, 1, 1) prompt_embeds = torch.cat([neg_prompt_embeds, prompt_embeds], dim=0) + neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat( + self.batch_size, 1 + ) add_text_embeds = torch.cat( [neg_pooled_prompt_embeds, add_text_embeds], dim=0 ) - add_text_embeds = add_text_embeds.to(self.torch_dtype) prompt_embeds = prompt_embeds.to(self.torch_dtype) return prompt_embeds, add_text_embeds @@ -129,7 +135,9 @@ def forward_turbo(self, text_input_ids_1, text_input_ids_2): pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view( bs_embed * 1, -1 ) + prompt_embeds = prompt_embeds.repeat(self.batch_size, 1, 1) add_text_embeds = pooled_prompt_embeds + add_text_embeds = add_text_embeds.repeat(self.batch_size, 1) add_text_embeds = add_text_embeds.to(self.torch_dtype) prompt_embeds = prompt_embeds.to(self.torch_dtype) @@ -152,6 +160,7 @@ def export_prompt_encoder( input_mlir=None, attn_spec=None, weights_only=False, + output_batchsize=1, ): if "turbo" in hf_model_name: do_classifier_free_guidance = False @@ -191,7 +200,11 @@ def export_prompt_encoder( ) tokenizers = [tokenizer_1, tokenizer_2] prompt_encoder_module = PromptEncoderModule( - hf_model_name, precision, hf_auth_token, do_classifier_free_guidance + hf_model_name, + precision, + hf_auth_token, + do_classifier_free_guidance, + batch_size=output_batchsize, ) if precision == "fp16": prompt_encoder_module = prompt_encoder_module.half() @@ -272,6 +285,7 @@ def encode_prompts_turbo( pipeline_dir=args.pipeline_dir, input_mlir=args.input_mlir, attn_spec=args.attn_spec, + output_batchsize=args.batch_size, ) if args.input_mlir: exit() From a12686779691d449bd55cbb20341b6668aae8060 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 4 Jun 2024 19:47:20 -0500 Subject: [PATCH 106/174] formatting --- .../turbine_models/custom_models/sdxl_inference/unet_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index 0dec00a61..4437b9eae 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -126,6 +126,7 @@ def run_torch_unet( if save_inputs: import os + inputs_dir = "sdxl_unet_inputs_" + args.precision if not os.path.exists(inputs_dir): os.mkdir(inputs_dir) @@ -178,4 +179,3 @@ def run_torch_unet( atol = 4e-2 rtol = 4e-1 np.testing.assert_allclose(turbine_output, torch_output, atol=atol, rtol=rtol) - From b2620813b1d1241754369d356113f30e650c2e16 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 4 Jun 2024 23:07:26 -0500 Subject: [PATCH 107/174] Fix resnet test. 
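The export is no longer expected to bail out with SystemExit (the
DenseResourceElementsAttr xfail is dropped); the test now just exports, runs
the vmfb-vs-torch comparison, and removes the vmfb. Roughly the same flow,
runnable outside the test harness, is sketched below; the argument keys mirror
the arguments dict already used in resnet_test.py:

    import argparse
    import os
    from turbine_models.custom_models import resnet_18

    model = resnet_18.Resnet18Model()
    resnet_18.export_resnet_18_model(model, "vmfb", "cpu")  # writes resnet_18.vmfb

    namespace = argparse.Namespace(
        run_vmfb=True,
        compile_to=None,
        vmfb_path="",
        device="local-task",
        iree_target_triple="",
        vulkan_max_allocation="4294967296",
    )
    # Runs the compiled module against torch and asserts the error is small.
    resnet_18.run_resnet_18_vmfb_comparison(model, namespace)
    os.remove("resnet_18.vmfb")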
--- models/turbine_models/tests/resnet_test.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/models/turbine_models/tests/resnet_test.py b/models/turbine_models/tests/resnet_test.py index 5d514e6fe..2a34de51f 100644 --- a/models/turbine_models/tests/resnet_test.py +++ b/models/turbine_models/tests/resnet_test.py @@ -18,17 +18,12 @@ class Resnet18Test(unittest.TestCase): - @pytest.mark.xfail( - reason="caused by lack of support for DenseResourceElementsAttr iteration over a generic FloatAttr" - ) def testExportResnet18Model(self): - with self.assertRaises(SystemExit) as cm: - resnet_18.export_resnet_18_model( - resnet_model, - "vmfb", - "cpu", - ) - self.assertEqual(cm.exception.code, None) + resnet_18.export_resnet_18_model( + resnet_model, + "vmfb", + "cpu", + ) namespace = argparse.Namespace(**arguments) resnet_18.run_resnet_18_vmfb_comparison(resnet_model, namespace) os.remove("resnet_18.vmfb") From 036179ef94e355464387430643457323f1c30723 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 5 Jun 2024 00:22:41 -0500 Subject: [PATCH 108/174] Add a very simple gfx1100 resnet test. --- .gitignore | 1 + .../turbine_models/custom_models/resnet_18.py | 25 +++++++- models/turbine_models/tests/resnet_test.py | 63 +++++++++++++++---- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index d85c8598b..f5fe49941 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ wheelhouse *.safetensors *.gguf *.vmfb +*.mlir \ No newline at end of file diff --git a/models/turbine_models/custom_models/resnet_18.py b/models/turbine_models/custom_models/resnet_18.py index c2321f49a..010ed523c 100644 --- a/models/turbine_models/custom_models/resnet_18.py +++ b/models/turbine_models/custom_models/resnet_18.py @@ -68,8 +68,28 @@ def main(self, x=AbstractTensor(None, 3, 224, 224, dtype=torch.float32)): else: utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, "resnet_18") +def export_static_resnet_18_model( + resnet_model, compile_to="torch", device=None, target_triple=None, max_alloc=None +): + resnet_model = resnet_model.half() + class CompiledResnet18Model(CompiledModule): + params = export_parameters(resnet_model.model) + + def main(self, x=AbstractTensor(5, 3, 224, 224, dtype=torch.float16)): + return jittable(resnet_model.forward)(x) + + import_to = "INPUT" if compile_to == "linalg" else "IMPORT" + inst = CompiledResnet18Model(context=Context(), import_to=import_to) + + module_str = str(CompiledModule.get_mlir_module(inst)) + if compile_to != "vmfb": + return module_str + else: + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, "resnet_18") + def run_resnet_18_vmfb_comparison(resnet_model, args): + torch_dtype = torch.float32 if args.precision == "fp32" else torch.float16 config = rt.Config(args.device) if args.vmfb_path: @@ -87,7 +107,7 @@ def run_resnet_18_vmfb_comparison(resnet_model, args): vm_modules=vm_modules, config=config, ) - inp = torch.rand(5, 3, 224, 224, dtype=torch.float32) + inp = torch.rand(5, 3, 224, 224, dtype=torch_dtype) device_inputs = [rt.asdevicearray(config.device, inp)] # Turbine output @@ -107,7 +127,8 @@ def run_resnet_18_vmfb_comparison(resnet_model, args): err = utils.largest_error(torch_output, turbine_output) print("LARGEST ERROR:", err) - assert err < 9e-5 + del CompModule + return err if __name__ == "__main__": diff --git a/models/turbine_models/tests/resnet_test.py b/models/turbine_models/tests/resnet_test.py index 2a34de51f..efce70299 100644 --- 
a/models/turbine_models/tests/resnet_test.py +++ b/models/turbine_models/tests/resnet_test.py @@ -5,30 +5,67 @@ import os import pytest -arguments = { - "run_vmfb": True, - "compile_to": None, - "vmfb_path": "", - "device": "local-task", - "iree_target_triple": "", - "vulkan_max_allocation": "4294967296", -} - resnet_model = resnet_18.Resnet18Model() class Resnet18Test(unittest.TestCase): - def testExportResnet18Model(self): + def testExportResnet18ModelCPU(self): + from turbine_models.tests.testing_cmd_opts import args + arguments = { + "run_vmfb": True, + "compile_to": "vmfb", + "vmfb_path": "", + "device": "local-task", + "target_triple": "x86_64-unknown-linux-gnu", + "vulkan_max_allocation": "4294967296", + "precision": "fp32", + } resnet_18.export_resnet_18_model( resnet_model, "vmfb", "cpu", ) - namespace = argparse.Namespace(**arguments) - resnet_18.run_resnet_18_vmfb_comparison(resnet_model, namespace) - os.remove("resnet_18.vmfb") + namespace = AttributeDict(arguments) + err = resnet_18.run_resnet_18_vmfb_comparison(resnet_model, namespace) + assert err < 1e-5 + + def testExportResnet18ModelStaticGFX1100(self): + from turbine_models.tests.testing_cmd_opts import args + arguments = { + "run_vmfb": True, + "compile_to": "vmfb", + "vmfb_path": "", + "device": "rocm", + "target_triple": "gfx1100", + "vulkan_max_allocation": "4294967296", + "precision": "fp16", + } + resnet_18.export_static_resnet_18_model( + resnet_model, + "vmfb", + "rocm", + arguments["target_triple"], + ) + namespace = AttributeDict(arguments) + rocm_err = resnet_18.run_resnet_18_vmfb_comparison(resnet_model, namespace) + namespace.device = "hip" + hip_err = resnet_18.run_resnet_18_vmfb_comparison(resnet_model, namespace) + print("ROCM ERROR:", rocm_err) + print("HIP ERROR:", hip_err) + assert rocm_err < 1e-5 + assert hip_err < 1e-5 + +class AttributeDict(dict): + def __getattr__(self, attr): + return self[attr] + def __setattr__(self, attr, value): + self[attr] = value if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) unittest.main() + if os.path.exists("resnet_18.mlir"): + os.remove("resnet_18.mlir") + if os.path.exists("resnet_18.vmfb"): + os.remove("resnet_18.vmfb") From 81181fadf74c5a5d2944c81d51fb34d203ac5c0b Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 5 Jun 2024 02:11:55 -0500 Subject: [PATCH 109/174] A few more resnet fixups for testing hip numerics, add winograd flags to znver4 compile flags --- models/turbine_models/custom_models/resnet_18.py | 5 ++++- .../turbine_models/custom_models/sd_inference/utils.py | 2 -- models/turbine_models/tests/resnet_test.py | 10 ++++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/models/turbine_models/custom_models/resnet_18.py b/models/turbine_models/custom_models/resnet_18.py index 010ed523c..dd36e4578 100644 --- a/models/turbine_models/custom_models/resnet_18.py +++ b/models/turbine_models/custom_models/resnet_18.py @@ -32,7 +32,7 @@ parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") # TODO: Add other resnet models - +torch.random.manual_seed(0) class Resnet18Model(torch.nn.Module): def __init__(self): @@ -89,6 +89,7 @@ def main(self, x=AbstractTensor(5, 3, 224, 224, dtype=torch.float16)): def run_resnet_18_vmfb_comparison(resnet_model, args): + import numpy as np torch_dtype = torch.float32 if args.precision == "fp32" else torch.float16 config = rt.Config(args.device) @@ -108,6 +109,7 @@ def run_resnet_18_vmfb_comparison(resnet_model, args): config=config, ) inp = torch.rand(5, 3, 224, 224, 
dtype=torch_dtype) + np.save(f"test_input_{args.precision}.npy", inp.numpy()) device_inputs = [rt.asdevicearray(config.device, inp)] # Turbine output @@ -124,6 +126,7 @@ def run_resnet_18_vmfb_comparison(resnet_model, args): torch_output = resnet_model.forward(inp) torch_output = torch_output.detach().cpu().numpy() print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) + np.save(f"resnet18_golden_out.npy", torch_output) err = utils.largest_error(torch_output, turbine_output) print("LARGEST ERROR:", err) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index bc986ae4d..2f0054920 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -71,8 +71,6 @@ "--iree-llvmcpu-enable-ukernels=mmt4d,pack,unpack", "--iree-flow-collapse-reduction-dims", "--iree-opt-const-expr-max-size-increase-threshold=1000000000000000", - ], - "winograd": [ "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", "--iree-flow-enable-fuse-padding-into-linalg-consumer-ops", ], diff --git a/models/turbine_models/tests/resnet_test.py b/models/turbine_models/tests/resnet_test.py index efce70299..a7349eb49 100644 --- a/models/turbine_models/tests/resnet_test.py +++ b/models/turbine_models/tests/resnet_test.py @@ -55,6 +55,12 @@ def testExportResnet18ModelStaticGFX1100(self): assert rocm_err < 1e-5 assert hip_err < 1e-5 + # def tearDown(self): + # if os.path.exists("resnet_18.vmfb"): + # os.remove("resnet_18.vmfb") + # if os.path.exists("resnet_18.mlir"): + # os.remove("resnet_18.mlir") + class AttributeDict(dict): def __getattr__(self, attr): @@ -65,7 +71,3 @@ def __setattr__(self, attr, value): if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) unittest.main() - if os.path.exists("resnet_18.mlir"): - os.remove("resnet_18.mlir") - if os.path.exists("resnet_18.vmfb"): - os.remove("resnet_18.vmfb") From 0d1a4b91bbc841335df094911ab65651399ad167 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 5 Jun 2024 13:57:26 -0500 Subject: [PATCH 110/174] Updates to scheduled unet runner --- .../sdxl_scheduled_unet_runner.py | 205 +++++++++--------- 1 file changed, 108 insertions(+), 97 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py index cc0c9791c..93ce69b43 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py @@ -8,42 +8,6 @@ torch.random.manual_seed(0) - -def run_unet_hybrid( - sample, - prompt_embeds, - text_embeds, - args, -): - runner = vmfbRunner(args.device, args.vmfb_path, args.external_weight_path) - init_inp = [ - ireert.asdevicearray(runner.config.device, sample), - ] - sample, time_ids, steps = runner.ctx.modules.compiled_scheduled_unet[ - "run_initialize" - ]( - *init_inp, - ) - dtype = "float16" if args.precision == "fp16" else "float32" - inputs = [ - sample, - ireert.asdevicearray(runner.config.device, prompt_embeds), - ireert.asdevicearray(runner.config.device, text_embeds), - time_ids, - ireert.asdevicearray( - runner.config.device, np.asarray([args.guidance_scale]), dtype=dtype - ), - None, - ] - for i in range(steps.to_host()): - 
inputs[0] = sample - inputs[5] = ireert.asdevicearray( - runner.config.device, torch.tensor([i]), dtype="int64" - ) - sample = runner.ctx.modules.compiled_scheduled_unet["run_forward"](*inputs) - return sample - - def run_torch_scheduled_unet( sample, prompt_embeds, @@ -77,7 +41,7 @@ def run_torch_scheduled_unet( return sample -def run_scheduled_unet( +def run_scheduled_unet_compiled( sample, prompt_embeds, text_embeds, @@ -104,6 +68,90 @@ def run_scheduled_unet( return latents +def run_scheduled_unet_python( + sample, + prompt_embeds, + text_embeds, + args, +): + unet_runner = vmfbRunner( + args.device, + args.vmfb_path, + args.external_weight_path, + ) + dtype = "float16" if args.precision == "fp16" else "float32" + sample, time_ids, steps = run_scheduled_unet_initialize( + sample, + unet_runner, + args, + ) + iree_inputs = [ + sample, + ireert.asdevicearray(unet_runner.config.device, prompt_embeds), + ireert.asdevicearray(unet_runner.config.device, text_embeds), + time_ids, + ireert.asdevicearray( + unet_runner.config.device, np.asarray([args.guidance_scale]), dtype=dtype + ), + None, + ] + for i in range(steps.to_host()): + iree_inputs[0] = sample + iree_inputs[5] = ireert.asdevicearray( + unet_runner.config.device, torch.tensor([i]), dtype="int64" + ) + sample = run_scheduled_unet_forward( + sample, + prompt_embeds, + text_embeds, + time_ids, + args.guidance_scale, + i, + unet_runner, + args, + ) + return sample + +def run_scheduled_unet_initialize( + sample, + unet_runner, + args, +): + dtype = "float16" if args.precision == "fp16" else "float32" + inputs = [ + ireert.asdevicearray(unet_runner.config.device, sample), + ] + sample, time_ids, steps = unet_runner.ctx.modules.compiled_scheduled_unet["run_initialize"]( + *inputs, + ) + return sample, time_ids, steps + +def run_scheduled_unet_forward( + sample, + prompt_embeds, + text_embeds, + time_ids, + guidance_scale, + timestep, + unet_runner, + args, +): + dtype = "float16" if args.precision == "fp16" else "float32" + inputs = [ + ireert.asdevicearray(unet_runner.config.device, sample, dtype=dtype), + ireert.asdevicearray(unet_runner.config.device, prompt_embeds, dtype=dtype), + ireert.asdevicearray(unet_runner.config.device, text_embeds, dtype=dtype), + time_ids, + ireert.asdevicearray( + unet_runner.config.device, np.asarray([guidance_scale]), dtype=dtype + ), + ireert.asdevicearray( + unet_runner.config.device, np.asarray([timestep]), dtype="int64" + ), + ] + sample = unet_runner.ctx.modules.compiled_scheduled_unet["run_forward"](*inputs) + return sample + def run_torch_diffusers_loop( sample, @@ -166,10 +214,7 @@ def run_torch_diffusers_loop( dtype = torch.float16 else: dtype = torch.float32 - # if "turbo" in args.hf_model_name: - # init_batch_dim = 1 - # else: - # init_batch_dim = 2 + init_batch_dim = 2 sample = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=dtype @@ -181,29 +226,35 @@ def run_torch_diffusers_loop( text_embeds = torch.rand(init_batch_dim * args.batch_size, 1280, dtype=dtype) time_ids = torch.rand(init_batch_dim * args.batch_size, 6) - turbine_output = run_scheduled_unet( + turbine_python_output = run_scheduled_unet_python( sample, prompt_embeds, text_embeds, args, + ).to_host() + print( + "TURBINE PYTHON OUTPUT:", + turbine_python_output, + turbine_python_output.shape, + turbine_python_output.dtype, ) + turbine_compiled_output = run_scheduled_unet_compiled( + sample, + prompt_embeds, + text_embeds, + args, + ).to_host() print( - "TURBINE OUTPUT:", - turbine_output.to_host(), - 
turbine_output.to_host().shape, - turbine_output.to_host().dtype, + "TURBINE COMPILED OUTPUT:", + turbine_compiled_output, + turbine_compiled_output.shape, + turbine_compiled_output.dtype, ) + if args.compare_vs_torch: from turbine_models.custom_models.sd_inference import utils - print("generating output with python/torch scheduling unet: ") - hybrid_output = run_unet_hybrid( - sample, - prompt_embeds, - text_embeds, - args, - ) print("generating torch output: ") torch_output = run_torch_scheduled_unet( sample, @@ -211,59 +262,19 @@ def run_torch_diffusers_loop( text_embeds, args, ) - print("generating torch+diffusers output: ") - diff_output = run_torch_diffusers_loop( - sample, - prompt_embeds, - text_embeds, - args, - ) - print( - "diffusers-like OUTPUT:", diff_output, diff_output.shape, diff_output.dtype - ) print("torch OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - print( - "HYBRID OUTPUT:", - hybrid_output.to_host(), - hybrid_output.to_host().shape, - hybrid_output.to_host().dtype, - ) - print("Comparing... \n(turbine pipelined unet to torch unet): ") - try: - np.testing.assert_allclose( - turbine_output, torch_output, rtol=4e-2, atol=4e-2 - ) - except AssertionError as err: - print(err) - print("\n(turbine pipelined unet to hybrid unet): ") - try: - np.testing.assert_allclose( - hybrid_output, turbine_output, rtol=4e-2, atol=4e-2 - ) - print("passed!") - except AssertionError as err: - print(err) - print("\n(hybrid unet to diff unet): ") - try: - np.testing.assert_allclose(diff_output, hybrid_output, rtol=4e-2, atol=4e-2) - print("passed!") - except AssertionError as err: - print(err) - print("\n(turbine loop to diffusers loop): ") + try: np.testing.assert_allclose( - turbine_output, diff_output, rtol=4e-2, atol=4e-2 + turbine_compiled_output, torch_output, rtol=4e-2, atol=4e-2 ) print("passed!") except AssertionError as err: print(err) - print("\n(torch sched unet loop to diffusers loop): ") + print("\n(torch sched unet loop to iree python loop): ") try: - np.testing.assert_allclose(torch_output, diff_output, rtol=4e-2, atol=4e-2) + np.testing.assert_allclose(turbine_python_output, torch_output, rtol=4e-2, atol=4e-2) print("passed!") except AssertionError as err: print(err) - - # TODO: Figure out why we occasionally segfault without unlinking output variables - turbine_output = None From df896a689480fa420161823cb3f849874fa4cda9 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 5 Jun 2024 15:04:12 -0500 Subject: [PATCH 111/174] More sched unet runner fixes, remove bf16/winograd flags from default znver4 flags --- .../custom_models/sd_inference/utils.py | 3 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 2 +- .../sdxl_scheduled_unet_runner.py | 47 ++++++++++--------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 2f0054920..1bbc3f973 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -65,13 +65,12 @@ } znver4_flags = { "all": [ - "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-global-opt-demote-contraction-inputs-to-bf16))", + #"--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", "--iree-llvmcpu-target-cpu=znver4", "--iree-opt-const-eval=false", 
"--iree-llvmcpu-enable-ukernels=mmt4d,pack,unpack", "--iree-flow-collapse-reduction-dims", "--iree-opt-const-expr-max-size-increase-threshold=1000000000000000", - "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", "--iree-flow-enable-fuse-padding-into-linalg-consumer-ops", ], } diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 23af25aa4..4cea924e0 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -1,4 +1,4 @@ -# Copyright 2023 Nod Labs, Inc +# Copyright 2024 Advanced Micro Devices, inc. # # Licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py index 93ce69b43..ba91ea673 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py @@ -185,8 +185,8 @@ def run_torch_diffusers_loop( prompt_embeds = prompt_embeds.to(torch.float32) text_embeds = text_embeds.to(torch.float32) - for i in range(args.num_inference_steps): - timestep = scheduler.timesteps[i] + for idx, i in enumerate(scheduler.timesteps): + timestep = i latent_model_input = scheduler.scale_model_input(sample, timestep) noise_pred = unet_model.forward( @@ -225,6 +225,20 @@ def run_torch_diffusers_loop( ) text_embeds = torch.rand(init_batch_dim * args.batch_size, 1280, dtype=dtype) time_ids = torch.rand(init_batch_dim * args.batch_size, 6) + if args.compiled_pipeline: + assert args.pipeline_vmfb_path is not None, "--pipeline_vmfb_path is required for compiled pipeline run" + turbine_compiled_output = run_scheduled_unet_compiled( + sample, + prompt_embeds, + text_embeds, + args, + ).to_host() + print( + "TURBINE COMPILED OUTPUT:", + turbine_compiled_output, + turbine_compiled_output.shape, + turbine_compiled_output.dtype, + ) turbine_python_output = run_scheduled_unet_python( sample, @@ -238,18 +252,7 @@ def run_torch_diffusers_loop( turbine_python_output.shape, turbine_python_output.dtype, ) - turbine_compiled_output = run_scheduled_unet_compiled( - sample, - prompt_embeds, - text_embeds, - args, - ).to_host() - print( - "TURBINE COMPILED OUTPUT:", - turbine_compiled_output, - turbine_compiled_output.shape, - turbine_compiled_output.dtype, - ) + if args.compare_vs_torch: @@ -264,17 +267,17 @@ def run_torch_diffusers_loop( ) print("torch OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - - try: - np.testing.assert_allclose( - turbine_compiled_output, torch_output, rtol=4e-2, atol=4e-2 - ) - print("passed!") - except AssertionError as err: - print(err) print("\n(torch sched unet loop to iree python loop): ") try: np.testing.assert_allclose(turbine_python_output, torch_output, rtol=4e-2, atol=4e-2) print("passed!") except AssertionError as err: print(err) + + if args.compiled_pipeline: + print("\n(torch sched unet loop to iree compiled loop): ") + try: + np.testing.assert_allclose(turbine_compiled_output, torch_output, rtol=4e-2, atol=4e-2) + print("passed!") + except AssertionError as err: + 
print(err) From 7e250c73ceb053e08531f74ed49580509046d955 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 6 Jun 2024 08:45:39 -0500 Subject: [PATCH 112/174] Remove SDE scheduler for now, model fixes - Removes a scheduler we don't use yet from the schedulers init - Updates static resnet example to use aot.export API - Simplifies and fixes scheduled unet runner --- .../turbine_models/custom_models/resnet_18.py | 17 +++------- .../custom_models/sd_inference/utils.py | 10 +++--- .../sdxl_inference/sdxl_scheduled_unet.py | 3 ++ .../sdxl_scheduled_unet_runner.py | 31 ++----------------- .../custom_models/sdxl_inference/unet.py | 3 ++ models/turbine_models/tests/resnet_test.py | 2 -- 6 files changed, 19 insertions(+), 47 deletions(-) diff --git a/models/turbine_models/custom_models/resnet_18.py b/models/turbine_models/custom_models/resnet_18.py index dd36e4578..ad61d0247 100644 --- a/models/turbine_models/custom_models/resnet_18.py +++ b/models/turbine_models/custom_models/resnet_18.py @@ -8,7 +8,7 @@ from iree.compiler.ir import Context import iree.runtime as rt from turbine_models.custom_models.sd_inference import utils - +import shark_turbine.ops.iree as ops import argparse parser = argparse.ArgumentParser() @@ -43,8 +43,7 @@ def __init__(self): # self.extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-18") def forward(self, pixel_values_tensor: torch.Tensor): - with torch.no_grad(): - logits = self.model.forward(pixel_values_tensor).logits + logits = self.model.forward(pixel_values_tensor).logits predicted_id = torch.argmax(logits, -1) return predicted_id @@ -72,16 +71,10 @@ def export_static_resnet_18_model( resnet_model, compile_to="torch", device=None, target_triple=None, max_alloc=None ): resnet_model = resnet_model.half() - class CompiledResnet18Model(CompiledModule): - params = export_parameters(resnet_model.model) - - def main(self, x=AbstractTensor(5, 3, 224, 224, dtype=torch.float16)): - return jittable(resnet_model.forward)(x) + input_args = (torch.empty((5, 3, 224, 224), dtype=torch.float16),) + exported = export(resnet_model, args=input_args) - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledResnet18Model(context=Context(), import_to=import_to) - - module_str = str(CompiledModule.get_mlir_module(inst)) + module_str = str(exported.mlir_module) if compile_to != "vmfb": return module_str else: diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 1bbc3f973..e8402b0ad 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -8,7 +8,7 @@ PNDMScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler, - DPMSolverSDEScheduler, + # DPMSolverSDEScheduler, ) # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. 
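# A condensed sketch of driving the reworked aot.export-based static ResNet path
# end to end; it mirrors the gfx1100 test added earlier in this series, and the
# "rocm"/"gfx1100"/fp16 choices below are placeholders to adjust per machine.
import argparse
from turbine_models.custom_models import resnet_18

model = resnet_18.Resnet18Model()
# compile_to="vmfb" routes the exported MLIR through utils.compile_to_vmfb,
# producing resnet_18.vmfb next to the script.
resnet_18.export_static_resnet_18_model(model, "vmfb", "rocm", "gfx1100")
args = argparse.Namespace(
    run_vmfb=True,
    compile_to="vmfb",
    vmfb_path="",
    device="rocm",
    target_triple="gfx1100",
    vulkan_max_allocation="4294967296",
    precision="fp16",
)
# Returns the largest elementwise error against the torch reference.
err = resnet_18.run_resnet_18_vmfb_comparison(model, args)
print("largest error vs. torch:", err)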
@@ -304,8 +304,8 @@ def get_schedulers(model_id): model_id, subfolder="scheduler", ) - schedulers["DPMSolverSDE"] = DPMSolverSDEScheduler.from_pretrained( - model_id, - subfolder="scheduler", - ) + # schedulers["DPMSolverSDE"] = DPMSolverSDEScheduler.from_pretrained( + # model_id, + # subfolder="scheduler", + # ) return schedulers diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index a4cc008f0..c89a6730a 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -103,10 +103,12 @@ def forward( latent_model_input = torch.cat([sample] * 2) else: latent_model_input = sample + #ops.iree.trace_tensor(f"latent_model_input_{step_index}", latent_model_input) latent_model_input = self.scheduler.scale_model_input( latent_model_input, t ).type(self.dtype) + #ops.iree.trace_tensor(f"latent_model_input_scaled_{step_index}", latent_model_input) noise_pred = self.unet.forward( latent_model_input, t, @@ -115,6 +117,7 @@ def forward( added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] + #ops.iree.trace_tensor(f"noise_pred_{step_index}", noise_pred) if self.do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py index ba91ea673..4f2ae4df1 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py @@ -35,7 +35,7 @@ def run_torch_scheduled_unet( prompt_embeds.float(), text_embeds.float(), add_time_ids.float(), - args.guidance_scale, + torch.tensor(args.guidance_scale, dtype=torch.float32), i, ) return sample @@ -61,7 +61,6 @@ def run_scheduled_unet_compiled( pipe_runner.config.device, np.asarray([args.guidance_scale]), dtype=dtype ), ] - print(inputs) latents = pipe_runner.ctx.modules.sdxl_compiled_pipeline["produce_image_latents"]( *inputs, ) @@ -101,12 +100,7 @@ def run_scheduled_unet_python( unet_runner.config.device, torch.tensor([i]), dtype="int64" ) sample = run_scheduled_unet_forward( - sample, - prompt_embeds, - text_embeds, - time_ids, - args.guidance_scale, - i, + iree_inputs, unet_runner, args, ) @@ -117,7 +111,6 @@ def run_scheduled_unet_initialize( unet_runner, args, ): - dtype = "float16" if args.precision == "fp16" else "float32" inputs = [ ireert.asdevicearray(unet_runner.config.device, sample), ] @@ -127,28 +120,10 @@ def run_scheduled_unet_initialize( return sample, time_ids, steps def run_scheduled_unet_forward( - sample, - prompt_embeds, - text_embeds, - time_ids, - guidance_scale, - timestep, + inputs, unet_runner, args, ): - dtype = "float16" if args.precision == "fp16" else "float32" - inputs = [ - ireert.asdevicearray(unet_runner.config.device, sample, dtype=dtype), - ireert.asdevicearray(unet_runner.config.device, prompt_embeds, dtype=dtype), - ireert.asdevicearray(unet_runner.config.device, text_embeds, dtype=dtype), - time_ids, - ireert.asdevicearray( - unet_runner.config.device, np.asarray([guidance_scale]), dtype=dtype - ), - ireert.asdevicearray( - unet_runner.config.device, np.asarray([timestep]), dtype="int64" - ), - ] sample = unet_runner.ctx.modules.compiled_scheduled_unet["run_forward"](*inputs) return sample diff --git 
a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index e756c8967..af3522ddd 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -159,6 +159,9 @@ def export_unet_model( time_ids_shape = (init_batch_dim * batch_size, 6) prompt_embeds_shape = (init_batch_dim * batch_size, max_length, 2048) text_embeds_shape = (init_batch_dim * batch_size, 1280) + timestep_shape = (1,) + guidance_scale_shape = (1,) + class CompiledUnet(CompiledModule): if external_weights: diff --git a/models/turbine_models/tests/resnet_test.py b/models/turbine_models/tests/resnet_test.py index a7349eb49..1a38c2594 100644 --- a/models/turbine_models/tests/resnet_test.py +++ b/models/turbine_models/tests/resnet_test.py @@ -3,7 +3,6 @@ from turbine_models.custom_models import resnet_18 import unittest import os -import pytest resnet_model = resnet_18.Resnet18Model() @@ -30,7 +29,6 @@ def testExportResnet18ModelCPU(self): assert err < 1e-5 def testExportResnet18ModelStaticGFX1100(self): - from turbine_models.tests.testing_cmd_opts import args arguments = { "run_vmfb": True, "compile_to": "vmfb", From 667282d5f00d5fcc54a900f2e07dcbe61a75d0de Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 7 Jun 2024 02:51:22 -0500 Subject: [PATCH 113/174] Establish support for split scheduling. This completes the first push for a "split scheduler" implementation that lets us delegate all step count and scheduling dynamicism into nice little modules that compile in a flash, instead of parametrizing unet compile (and weights if you're inlining) on those dynamic parameters. For what it's worth, it also marks the first time scheduled unet runner isn't a complete stinking mess -- now it gives results that match the standalone unet numerics comparison with torch as a baseline. Worth mentioning also that sdxl unet, sdxl scheduler unet, and the new schedulers are all migrated to the FxProgramBuilder export methodology in this commit. 
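A rough sketch of the intended calling pattern for the decoupled modules, condensed
from the run_unet_split_scheduled loop added in this patch; the device string, vmfb
and weight paths, and the 1024x1024 / fp16 / batch-size-1 shapes are illustrative
placeholders:

    import torch
    import numpy as np
    from iree import runtime as ireert
    from turbine_models.model_runner import vmfbRunner
    from turbine_models.custom_models.sd_inference import schedulers

    unet = vmfbRunner("rocm", "unet.vmfb", "unet_weights.safetensors")
    sched = schedulers.SharkSchedulerWrapper("rocm", "scheduler.vmfb")

    # Latents and CLIP embeddings; shapes assume CFG (2x batch) and 64-token prompts.
    sample = torch.rand(1, 4, 128, 128, dtype=torch.float16)
    prompt_embeds = ireert.asdevicearray(
        unet.config.device, torch.rand(2, 64, 2048, dtype=torch.float16)
    )
    text_embeds = ireert.asdevicearray(
        unet.config.device, torch.rand(2, 1280, dtype=torch.float16)
    )
    guidance_scale = ireert.asdevicearray(
        sched.runner.config.device, np.asarray([7.5]), dtype="float16"
    )

    sample, time_ids, steps = sched.initialize(sample)
    for i in range(steps.to_host()):
        step_index = ireert.asdevicearray(
            unet.config.device, torch.tensor([i]), dtype="int64"
        )
        # The scheduler scales/duplicates the latent and returns the timestep for this step.
        latents, t = sched.scale_model_input(sample, step_index)
        noise_pred = unet.ctx.modules.compiled_unet["run_forward"](
            latents, t, prompt_embeds, text_embeds, time_ids
        )
        # CFG mixing and the scheduler update both happen inside the scheduler module.
        sample = sched.step(noise_pred, t, sample, guidance_scale, step_index)
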
--- .../turbine_models/custom_models/resnet_18.py | 3 + .../custom_models/sd_inference/schedulers.py | 229 +++++++++++------- .../custom_models/sd_inference/utils.py | 3 +- .../sdxl_inference/sdxl_cmd_opts.py | 14 ++ .../sdxl_inference/sdxl_scheduled_unet.py | 206 +++++++++------- .../sdxl_scheduled_unet_runner.py | 147 ++++++++--- .../custom_models/sdxl_inference/unet.py | 104 ++++---- .../custom_models/stateless_llama.py | 1 - models/turbine_models/tests/resnet_test.py | 5 +- 9 files changed, 436 insertions(+), 276 deletions(-) diff --git a/models/turbine_models/custom_models/resnet_18.py b/models/turbine_models/custom_models/resnet_18.py index ad61d0247..c1fd59b74 100644 --- a/models/turbine_models/custom_models/resnet_18.py +++ b/models/turbine_models/custom_models/resnet_18.py @@ -34,6 +34,7 @@ # TODO: Add other resnet models torch.random.manual_seed(0) + class Resnet18Model(torch.nn.Module): def __init__(self): super().__init__() @@ -67,6 +68,7 @@ def main(self, x=AbstractTensor(None, 3, 224, 224, dtype=torch.float32)): else: utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, "resnet_18") + def export_static_resnet_18_model( resnet_model, compile_to="torch", device=None, target_triple=None, max_alloc=None ): @@ -83,6 +85,7 @@ def export_static_resnet_18_model( def run_resnet_18_vmfb_comparison(resnet_model, args): import numpy as np + torch_dtype = torch.float32 if args.precision == "fp32" else torch.float16 config = rt.Config(args.device) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index f0ad8a848..5bef5ae14 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -36,48 +36,77 @@ class SharkSchedulerWrapper: - def __init__(self, rt_device, vmfb, weights): - self.runner = vmfbRunner(rt_device, vmfb, weights) + def __init__(self, rt_device, vmfb): + self.runner = vmfbRunner(rt_device, vmfb, None) def initialize(self, sample): - return self.runner.ctx.modules.scheduler["initialize"](sample) + return self.runner.ctx.modules.compiled_scheduler["run_initialize"](sample) def scale_model_input(self, sample, t): - return self.runner.ctx.modules.scheduler["scale_model_input"](sample, t) + return self.runner.ctx.modules.compiled_scheduler["run_scale"](sample, t) - def step(self, sample, latents, t): - return self.runner.ctx.modules.scheduler["step"](sample, latents, t) + def step(self, noise_pred, t, sample, guidance_scale, step_index): + return self.runner.ctx.modules.compiled_scheduler["run_step"]( + noise_pred, t, sample, guidance_scale, step_index + ) class SchedulingModel(torch.nn.Module): - def __init__(self, scheduler, height, width, num_inference_steps, dtype): + def __init__( + self, + hf_model_name, + scheduler, + height, + width, + batch_size, + num_inference_steps, + dtype, + ): + super().__init__() + # For now, assumes SDXL implementation. May not need parametrization for other models, + # but keeping hf_model_name in case. 
self.model = scheduler self.height = height self.width = width + self.batch_size = batch_size + self.do_classifier_free_guidance = True self.model.set_timesteps(num_inference_steps) self.model.is_scale_input_called = True self.dtype = dtype def initialize(self, sample): - height = sample.shape[-2] * 8 - width = sample.shape[-1] * 8 + height = self.height + width = self.width original_size = (height, width) target_size = (height, width) crops_coords_top_left = (0, 0) add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids]) - add_time_ids = torch.cat([add_time_ids] * 2, dim=0) - add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) + add_time_ids = torch.tensor([add_time_ids], dtype=self.dtype) + if self.do_classifier_free_guidance: + add_time_ids = torch.cat([add_time_ids] * 2, dim=0) + add_time_ids = add_time_ids.repeat(self.batch_size, 1).type(self.dtype) timesteps = self.model.timesteps step_indexes = torch.tensor(len(timesteps)) sample = sample * self.model.init_noise_sigma return sample.type(self.dtype), add_time_ids, step_indexes - def scale_model_input(self, sample, t): - return self.model.scale_model_input(sample, t) - - def step(self, latents, t, sample): - return self.model.step(latents, t, sample) + def prepare_model_input(self, sample, t): + t = self.model.timesteps[t] + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([sample] * 2) + else: + latent_model_input = sample + return self.model.scale_model_input(latent_model_input, t), t.type(self.dtype) + + def step(self, noise_pred, t, sample, guidance_scale, i): + self.model._step_index = i + if self.do_classifier_free_guidance: + noise_preds = noise_pred.chunk(2) + noise_pred = noise_preds[0] + guidance_scale * ( + noise_preds[1] - noise_preds[0] + ) + sample = self.model.step(noise_pred, t, sample)[0] + return sample.type(self.dtype) class SharkSchedulerCPUWrapper: @@ -134,24 +163,25 @@ def export_scheduler_model( input_mlir: str = None, upload_ir=False, ): + dtype = torch.float16 if precision == "fp16" else torch.float32 scheduler = get_scheduler(hf_model_name, scheduler_id) scheduler_module = SchedulingModel( - hf_model_name, scheduler, height, width, num_inference_steps - ) - vmfb_name = ( - scheduler_id - + "_" - + f"{height}x{width}" - + "_" - + precision - + "_" - + str(num_inference_steps), - +"_" + target_triple, + hf_model_name, scheduler, height, width, batch_size, num_inference_steps, dtype ) + vmfb_names = [ + scheduler_id + "Scheduler", + f"bs{batch_size}", + f"{height}x{width}", + precision, + str(num_inference_steps), + target_triple, + ] + vmfb_name = "_".join(vmfb_names) + if pipeline_dir: safe_name = os.path.join(pipeline_dir, vmfb_name) else: - safe_name = utils.create_safe_name(hf_model_name, vmfb_name) + safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -165,10 +195,11 @@ def export_scheduler_model( ) return vmfb_path - dtype = torch.float16 if precision == "fp16" else torch.float32 - - if precision == "fp16": - scheduled_unet_model = scheduled_unet_model.half() + do_classifier_free_guidance = True + if do_classifier_free_guidance: + init_batch_dim = 2 + else: + init_batch_dim = 1 sample = ( batch_size, @@ -176,30 +207,62 @@ def export_scheduler_model( height // 8, width // 8, ) + noise_pred_shape = ( + batch_size * init_batch_dim, + 4, + height // 8, + width // 8, + ) + example_init_args = [torch.empty(sample, dtype=dtype)] + 
example_prep_args = [ + torch.empty(sample, dtype=dtype), + torch.empty(1, dtype=torch.int64), + ] + example_step_args = [ + torch.empty(noise_pred_shape, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(sample, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(1, dtype=torch.int64), + ] + + fxb = FxProgramsBuilder(scheduler_module) + + @fxb.export_program( + args=(example_init_args,), + ) + def _initialize(module, sample): + return module.initialize(*sample) + + @fxb.export_program( + args=(example_prep_args,), + ) + def _scale(module, input): + return module.prepare_model_input(*input) - class CompiledScheduler(CompiledModule): - params = export_parameters(scheduled_unet_model) - - def initialize( - self, - sample=AbstractTensor(*sample, dtype=dtype), - ): - return jittable(scheduler_module.initialize)(sample) - - def scale_model_input( - self, - sample=AbstractTensor(*sample, dtype=dtype), - t=AbstractTensor(1, dtype=dtype), - ): - return jittable(scheduler_module.scale_model_input)(sample, t) - - def step( - self, - sample=AbstractTensor(*sample, dtype=dtype), - latents=AbstractTensor(1, dtype=dtype), - t=AbstractTensor(1, dtype=dtype), - ): - return jittable(scheduler_module.step)(sample, latents, t) + @fxb.export_program( + args=(example_step_args,), + ) + def _step(module, inputs): + return module.step(*inputs) + + decomp_list = [] + # if decomp_attn == True: + # decomp_list.extend( + # [ + # torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + # torch.ops.aten._scaled_dot_product_flash_attention.default, + # ] + # ) + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + + class CompiledScheduler(CompiledModule): + run_initialize = _initialize + run_scale = _scale + run_step = _step import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledScheduler(context=Context(), import_to=import_to) @@ -222,42 +285,14 @@ def step( return vmfb -# from shark_turbine.turbine_models.schedulers import export_scheduler_model - - def get_scheduler(model_id, scheduler_id): # TODO: switch over to turbine and run all on GPU print(f"\n[LOG] Initializing schedulers from model id: {model_id}") - schedulers = {} - for sched in SCHEDULER_MAP: - schedulers[sched] = SCHEDULER_MAP[sched].from_pretrained( + if scheduler_id in SCHEDULER_MAP.keys(): + scheduler = SCHEDULER_MAP[scheduler_id].from_pretrained( model_id, subfolder="scheduler" ) - schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", algorithm_type="dpmsolver" - ) - schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", algorithm_type="dpmsolver++" - ) - schedulers[ - "DPMSolverMultistepKarras" - ] = DPMSolverMultistepScheduler.from_pretrained( - model_id, - subfolder="scheduler", - ) - schedulers["DPMSolverMultistepKarras"].config.use_karras_sigmas = True - schedulers[ - "DPMSolverMultistepKarras++" - ] = DPMSolverMultistepScheduler.from_pretrained( - model_id, - subfolder="scheduler", - algorithm_type="dpmsolver++", - ) - schedulers["DPMSolverMultistepKarras++"].config.use_karras_sigmas = True - schedulers["DPMSolverSDE"] = DPMSolverSDEScheduler.from_pretrained( - model_id, subfolder="scheduler" - ) - return schedulers[scheduler_id] + return scheduler SCHEDULER_MAP = { @@ -273,6 +308,7 @@ def get_scheduler(model_id, scheduler_id): "DPMSolverSinglestep": DPMSolverSinglestepScheduler, "KDPM2AncestralDiscrete": KDPM2AncestralDiscreteScheduler, 
"HeunDiscrete": HeunDiscreteScheduler, + "DPMSolverMultistepKarras": DPMSolverMultistepScheduler, } if __name__ == "__main__": @@ -293,10 +329,15 @@ def get_scheduler(model_id, scheduler_id): exit_on_vmfb=False, input_mlir=args.input_mlir, ) - safe_name = utils.create_safe_name( - args.hf_model_name, - "_" + args.scheduler_id + "_" + str(args.num_inference_steps), - ) - with open(f"{safe_name}.mlir", "w+") as f: - f.write(mod_str) - print("Saved to", safe_name + ".mlir") + vmfb_names = [ + args.scheduler_id + "Scheduler", + f"_bs{args.batch_size}_{args.height}x{args.width}", + args.precision, + str(args.num_inference_steps), + args.iree_target_triple, + ] + safe_name = "_".join(vmfb_names) + if args.compile_to != "vmfb": + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index e8402b0ad..40e8051cd 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -65,7 +65,7 @@ } znver4_flags = { "all": [ - #"--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", + # "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", "--iree-llvmcpu-target-cpu=znver4", "--iree-opt-const-eval=false", "--iree-llvmcpu-enable-ukernels=mmt4d,pack,unpack", @@ -136,6 +136,7 @@ def compile_to_vmfb( "--iree-hal-target-backends=rocm", "--iree-rocm-target-chip=" + target_triple, "--iree-vm-bytecode-module-output-format=flatbuffer-binary", + "--iree-flow-inline-constants-max-byte-length=1", ] ) if target_triple == "gfx942": diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py index 8921847ad..410aa91af 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py @@ -116,6 +116,20 @@ def is_valid_file(arg): help="path to vmfb containing compiled meta-module", ) +p.add_argument( + "--scheduler_vmfb_path", + type=str, + default="", + help="path to vmfb containing compiled scheduler", +) + +p.add_argument( + "--split_scheduler", + default=False, + action="store_true", + help="Use a decoupled unet and scheduler for better QOL.", +) + p.add_argument( "--external_weight_file", type=str, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index c89a6730a..3a833b131 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -9,20 +9,22 @@ import copy import os import sys +import numpy as np + +# os.environ["TORCH_LOGS"] = "+dynamo" + +import torch +import torch._dynamo as dynamo from iree import runtime as ireert from iree.compiler.ir import Context -import numpy as np + from shark_turbine.aot import * import shark_turbine.ops as ops + from turbine_models.custom_models.sd_inference import utils from turbine_models.custom_models.sd_inference.schedulers import get_scheduler -import torch -import torch._dynamo as dynamo 
from diffusers import UNet2DConditionModel -from shark_turbine.dynamo.passes import ( - DEFAULT_DECOMPOSITIONS, -) class SDXLScheduledUnet(torch.nn.Module): @@ -49,6 +51,9 @@ def __init__( self.scheduler.set_timesteps(num_inference_steps) self.scheduler.is_scale_input_called = True self.return_index = return_index + self.height = height + self.width = width + self.batch_size = batch_size if precision == "fp16": try: @@ -75,8 +80,8 @@ def __init__( ) def initialize(self, sample): - height = sample.shape[-2] * 8 - width = sample.shape[-1] * 8 + height = self.height + width = self.width original_size = (height, width) target_size = (height, width) crops_coords_top_left = (0, 0) @@ -84,7 +89,7 @@ def initialize(self, sample): add_time_ids = torch.tensor([add_time_ids], dtype=self.dtype) if self.do_classifier_free_guidance: add_time_ids = torch.cat([add_time_ids] * 2, dim=0) - add_time_ids = add_time_ids.repeat(sample.shape[0], 1).type(self.dtype) + add_time_ids = add_time_ids.repeat(self.batch_size, 1).type(self.dtype) timesteps = self.scheduler.timesteps step_indexes = torch.tensor(len(timesteps)) sample = sample * self.scheduler.init_noise_sigma @@ -93,41 +98,50 @@ def initialize(self, sample): def forward( self, sample, prompt_embeds, text_embeds, time_ids, guidance_scale, step_index ): - with torch.no_grad(): - added_cond_kwargs = { - "time_ids": time_ids, - "text_embeds": text_embeds, - } - t = self.scheduler.timesteps[step_index] - if self.do_classifier_free_guidance: - latent_model_input = torch.cat([sample] * 2) - else: - latent_model_input = sample - #ops.iree.trace_tensor(f"latent_model_input_{step_index}", latent_model_input) - - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t - ).type(self.dtype) - #ops.iree.trace_tensor(f"latent_model_input_scaled_{step_index}", latent_model_input) - noise_pred = self.unet.forward( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=None, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - #ops.iree.trace_tensor(f"noise_pred_{step_index}", noise_pred) - - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - sample = self.scheduler.step(noise_pred, t, sample, return_dict=False)[0] - return sample.type(self.dtype) + added_cond_kwargs = { + "time_ids": time_ids, + "text_embeds": text_embeds, + } + t = self.scheduler.timesteps[step_index] + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([sample] * 2) + else: + latent_model_input = sample + # ops.iree.trace_tensor(f"latent_model_input_{step_index}", latent_model_input) + + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ).type(self.dtype) + print( + latent_model_input.shape, + t.shape, + sample.shape, + prompt_embeds.shape, + added_cond_kwargs, + guidance_scale, + step_index, + ) + # ops.iree.trace_tensor(f"latent_model_input_scaled_{step_index}", latent_model_input) + noise_pred = self.unet.forward( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + # ops.iree.trace_tensor(f"noise_pred_{step_index}", noise_pred) + + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + 
sample = self.scheduler.step(noise_pred, t, sample, return_dict=False)[0] + return sample.type(self.dtype) +@torch.no_grad() def export_scheduled_unet_model( scheduled_unet_model, scheduler_id, @@ -180,77 +194,85 @@ def export_scheduled_unet_model( ) return vmfb_path - mapper = {} - - decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) - if decomp_attn == True: - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": scheduled_unet_model = scheduled_unet_model.half() + mapper = {} utils.save_external_weights( mapper, scheduled_unet_model, external_weights, external_weight_path ) - if weights_only: return external_weight_path - sample = ( + if do_classifier_free_guidance: + init_batch_dim = 2 + else: + init_batch_dim = 1 + + sample_shape = [ batch_size, scheduled_unet_model.unet.config.in_channels, height // 8, width // 8, + ] + time_ids_shape = [init_batch_dim * batch_size, 6] + prompt_embeds_shape = [init_batch_dim * batch_size, max_length, 2048] + text_embeds_shape = [init_batch_dim * batch_size, 1280] + + fxb = FxProgramsBuilder(scheduled_unet_model) + + example_init_args = [torch.empty(sample_shape, dtype=dtype)] + example_forward_args = [ + torch.empty(sample_shape, dtype=dtype), + torch.empty(prompt_embeds_shape, dtype=dtype), + torch.empty(text_embeds_shape, dtype=dtype), + torch.empty(time_ids_shape, dtype=dtype), + torch.empty(1, dtype=dtype), # guidance_scale + torch.empty(1, dtype=torch.int64), # timestep + ] + + @fxb.export_program( + args=(example_init_args,), ) - if do_classifier_free_guidance: - init_batch_dim = 2 - else: - init_batch_dim = 1 + def _initialize(module, sample): + return module.initialize(*sample) + + @fxb.export_program( + args=(example_forward_args,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) - time_ids_shape = (init_batch_dim * batch_size, 6) - prompt_embeds_shape = (init_batch_dim * batch_size, max_length, 2048) - text_embeds_shape = (init_batch_dim * batch_size, 1280) + decomp_list = [] + if decomp_attn == True: + decomp_list.extend( + [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + ] + ) + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + + class CompiledScheduledUnet(CompiledModule): + run_initialize = _initialize + run_forward = _forward - class CompiledScheduledUnet(CompiledModule): if external_weights: - params = export_parameters( - scheduled_unet_model, - external=True, - external_scope="", - name_mapper=mapper.get, - ) - else: - params = export_parameters(scheduled_unet_model) - - def run_initialize( - self, - sample=AbstractTensor(*sample, dtype=dtype), - ): - return jittable(scheduled_unet_model.initialize)(sample) - - def run_forward( - self, - sample=AbstractTensor(*sample, dtype=dtype), - prompt_embeds=AbstractTensor(*prompt_embeds_shape, dtype=dtype), - text_embeds=AbstractTensor(*text_embeds_shape, dtype=dtype), - time_ids=AbstractTensor(*time_ids_shape, dtype=dtype), - guidance_scale=AbstractTensor(1, dtype=dtype), - step_index=AbstractTensor(1, dtype=torch.int64), - ): - return jittable(scheduled_unet_model.forward, decompose_ops=decomp_list)( - sample, prompt_embeds, text_embeds, time_ids, guidance_scale, step_index - ) + externalize_module_parameters(scheduled_unet_model) + if 
external_weight_path and len(external_weight_path) > 1: + save_module_parameters(external_weight_path, scheduled_unet_model) - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledScheduledUnet(context=Context(), import_to=import_to) + inst = CompiledScheduledUnet(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) + module_str = str(CompiledModule.get_mlir_module(inst)) if compile_to != "vmfb": return module_str diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py index 4f2ae4df1..8d8446ccf 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py @@ -1,6 +1,6 @@ import argparse from turbine_models.model_runner import vmfbRunner -from turbine_models.custom_models.sd_inference import utils +from turbine_models.custom_models.sd_inference import utils, schedulers from iree import runtime as ireert import torch import numpy as np @@ -8,6 +8,7 @@ torch.random.manual_seed(0) + def run_torch_scheduled_unet( sample, prompt_embeds, @@ -67,6 +68,32 @@ def run_scheduled_unet_compiled( return latents + +def run_scheduled_unet_initialize( + sample, + unet_runner, + args, +): + inputs = [ + ireert.asdevicearray(unet_runner.config.device, sample), + ] + sample, time_ids, steps = unet_runner.ctx.modules.compiled_scheduled_unet[ + "run_initialize" + ]( + *inputs, + ) + return sample, time_ids, steps + + +def run_scheduled_unet_forward( + inputs, + unet_runner, + args, +): + sample = unet_runner.ctx.modules.compiled_scheduled_unet["run_forward"](*inputs) + return sample + + def run_scheduled_unet_python( sample, prompt_embeds, @@ -106,25 +133,57 @@ def run_scheduled_unet_python( ) return sample -def run_scheduled_unet_initialize( + +def run_unet_split_scheduled( sample, - unet_runner, + prompt_embeds, + text_embeds, args, ): - inputs = [ - ireert.asdevicearray(unet_runner.config.device, sample), - ] - sample, time_ids, steps = unet_runner.ctx.modules.compiled_scheduled_unet["run_initialize"]( - *inputs, + unet_runner = vmfbRunner( + args.device, + args.vmfb_path, + args.external_weight_path, ) - return sample, time_ids, steps - -def run_scheduled_unet_forward( - inputs, - unet_runner, - args, -): - sample = unet_runner.ctx.modules.compiled_scheduled_unet["run_forward"](*inputs) + scheduler = schedulers.SharkSchedulerWrapper( + args.device, + args.scheduler_vmfb_path, + ) + dtype = "float16" if args.precision == "fp16" else "float32" + guidance_scale = ireert.asdevicearray( + scheduler.runner.config.device, np.asarray([args.guidance_scale]), dtype=dtype + ) + sample, time_ids, steps = scheduler.initialize(sample) + iree_inputs = [ + sample, + ireert.asdevicearray(unet_runner.config.device, prompt_embeds), + ireert.asdevicearray(unet_runner.config.device, text_embeds), + time_ids, + None, + ] + for i in range(steps.to_host()): + print(f"step {i}") + step_index = ireert.asdevicearray( + unet_runner.config.device, torch.tensor([i]), dtype="int64" + ) + latents, t = scheduler.scale_model_input( + sample, + step_index, + ) + noise_pred = unet_runner.ctx.modules.compiled_unet["run_forward"]( + latents, + t, + iree_inputs[1], + iree_inputs[2], + iree_inputs[3], + ) + sample = scheduler.step( + noise_pred, + t, + iree_inputs[0], + guidance_scale, + step_index, + ) return sample @@ -201,7 +260,9 @@ def 
run_torch_diffusers_loop( text_embeds = torch.rand(init_batch_dim * args.batch_size, 1280, dtype=dtype) time_ids = torch.rand(init_batch_dim * args.batch_size, 6) if args.compiled_pipeline: - assert args.pipeline_vmfb_path is not None, "--pipeline_vmfb_path is required for compiled pipeline run" + assert ( + args.pipeline_vmfb_path is not None + ), "--pipeline_vmfb_path is required for compiled pipeline run" turbine_compiled_output = run_scheduled_unet_compiled( sample, prompt_embeds, @@ -214,21 +275,35 @@ def run_torch_diffusers_loop( turbine_compiled_output.shape, turbine_compiled_output.dtype, ) - - turbine_python_output = run_scheduled_unet_python( - sample, - prompt_embeds, - text_embeds, - args, - ).to_host() - print( - "TURBINE PYTHON OUTPUT:", - turbine_python_output, - turbine_python_output.shape, - turbine_python_output.dtype, - ) - - + elif args.split_scheduler: + assert ( + args.scheduler_vmfb_path is not None + ), "--scheduler_vmfb_path is required for split scheduler run" + turbine_split_output = run_unet_split_scheduled( + sample, + prompt_embeds, + text_embeds, + args, + ).to_host() + print( + "TURBINE SPLIT OUTPUT:", + turbine_split_output, + turbine_split_output.shape, + turbine_split_output.dtype, + ) + else: + turbine_python_output = run_scheduled_unet_python( + sample, + prompt_embeds, + text_embeds, + args, + ).to_host() + print( + "TURBINE PYTHON OUTPUT:", + turbine_python_output, + turbine_python_output.shape, + turbine_python_output.dtype, + ) if args.compare_vs_torch: from turbine_models.custom_models.sd_inference import utils @@ -244,7 +319,9 @@ def run_torch_diffusers_loop( print("\n(torch sched unet loop to iree python loop): ") try: - np.testing.assert_allclose(turbine_python_output, torch_output, rtol=4e-2, atol=4e-2) + np.testing.assert_allclose( + turbine_python_output, torch_output, rtol=4e-2, atol=4e-2 + ) print("passed!") except AssertionError as err: print(err) @@ -252,7 +329,9 @@ def run_torch_diffusers_loop( if args.compiled_pipeline: print("\n(torch sched unet loop to iree compiled loop): ") try: - np.testing.assert_allclose(turbine_compiled_output, torch_output, rtol=4e-2, atol=4e-2) + np.testing.assert_allclose( + turbine_compiled_output, torch_output, rtol=4e-2, atol=4e-2 + ) print("passed!") except AssertionError as err: print(err) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index af3522ddd..6665b18a7 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -47,39 +47,29 @@ def __init__(self, hf_model_name, hf_auth_token=None, precision="fp32"): auth_token=hf_auth_token, low_cpu_mem_usage=False, ) - if "turbo" in hf_model_name: - self.do_classifier_free_guidance = False - else: - self.do_classifier_free_guidance = True + # if "turbo" in hf_model_name: + # self.do_classifier_free_guidance = False + # else: + self.do_classifier_free_guidance = True - def forward( - self, sample, timestep, prompt_embeds, text_embeds, time_ids, guidance_scale - ): + def forward(self, latents, timestep, prompt_embeds, text_embeds, time_ids): with torch.no_grad(): added_cond_kwargs = { "text_embeds": text_embeds, "time_ids": time_ids, } - if self.do_classifier_free_guidance: - latent_model_input = torch.cat([sample] * 2) - else: - latent_model_input = sample noise_pred = self.unet.forward( - latent_model_input, + latents, timestep, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=None, 
added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) return noise_pred +@torch.no_grad() def export_unet_model( unet_model, hf_model_name, @@ -101,11 +91,6 @@ def export_unet_model( input_mlir=None, weights_only=False, ): - if "turbo" in hf_model_name: - do_classifier_free_guidance = False - else: - do_classifier_free_guidance = True - safe_name = utils.create_safe_name( hf_model_name, f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}", @@ -145,49 +130,62 @@ def export_unet_model( if weights_only: return external_weight_path - sample = ( - batch_size, + do_classifier_free_guidance = True + init_batch_dim = 2 if do_classifier_free_guidance else 1 + + prepared_latents = ( + batch_size * init_batch_dim, unet_model.unet.config.in_channels, height // 8, width // 8, ) - if do_classifier_free_guidance: - init_batch_dim = 2 - else: - init_batch_dim = 1 time_ids_shape = (init_batch_dim * batch_size, 6) prompt_embeds_shape = (init_batch_dim * batch_size, max_length, 2048) text_embeds_shape = (init_batch_dim * batch_size, 1280) - timestep_shape = (1,) - guidance_scale_shape = (1,) - + example_forward_args = [ + torch.empty(prepared_latents, dtype=dtype), + torch.empty(1, dtype=dtype), # timestep + torch.empty(prompt_embeds_shape, dtype=dtype), + torch.empty(text_embeds_shape, dtype=dtype), + torch.empty(time_ids_shape, dtype=dtype), + ] + + fxb = FxProgramsBuilder(unet_model) + + @fxb.export_program( + args=(example_forward_args,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) + + decomp_list = [] + if decomp_attn == True: + decomp_list.extend( + [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + ] + ) + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + + class CompiledUnet(CompiledModule): + run_forward = _forward - class CompiledUnet(CompiledModule): if external_weights: - params = export_parameters( - unet_model, external=True, external_scope="", name_mapper=mapper.get - ) - else: - params = export_parameters(unet_model) - - def main( - self, - sample=AbstractTensor(*sample, dtype=dtype), - timestep=AbstractTensor(1, dtype=torch.int64), - prompt_embeds=AbstractTensor(*prompt_embeds_shape, dtype=dtype), - text_embeds=AbstractTensor(*text_embeds_shape, dtype=dtype), - time_ids=AbstractTensor(*time_ids_shape, dtype=dtype), - guidance_scale=AbstractTensor(1, dtype=dtype), - ): - return jittable(unet_model.forward, decompose_ops=decomp_list)( - sample, timestep, prompt_embeds, text_embeds, time_ids, guidance_scale - ) + externalize_module_parameters(unet_model) + if external_weight_path and len(external_weight_path) > 1: + save_module_parameters(external_weight_path, unet_model) - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledUnet(context=Context(), import_to=import_to) + inst = CompiledUnet(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) + module_str = str(CompiledModule.get_mlir_module(inst)) if compile_to != "vmfb": return module_str diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 170302f6b..74fd4d421 100644 --- 
a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -9,7 +9,6 @@ import torch from torch.utils import _pytree as pytree from shark_turbine.aot import * -from shark_turbine.aot import decompositions from iree.compiler.ir import Context from turbine_models.custom_models.llm_optimizations.streaming_llm.modify_llama import ( enable_llama_pos_shift_attention, diff --git a/models/turbine_models/tests/resnet_test.py b/models/turbine_models/tests/resnet_test.py index 1a38c2594..0cafcd2c7 100644 --- a/models/turbine_models/tests/resnet_test.py +++ b/models/turbine_models/tests/resnet_test.py @@ -10,6 +10,7 @@ class Resnet18Test(unittest.TestCase): def testExportResnet18ModelCPU(self): from turbine_models.tests.testing_cmd_opts import args + arguments = { "run_vmfb": True, "compile_to": "vmfb", @@ -27,7 +28,7 @@ def testExportResnet18ModelCPU(self): namespace = AttributeDict(arguments) err = resnet_18.run_resnet_18_vmfb_comparison(resnet_model, namespace) assert err < 1e-5 - + def testExportResnet18ModelStaticGFX1100(self): arguments = { "run_vmfb": True, @@ -63,9 +64,11 @@ def testExportResnet18ModelStaticGFX1100(self): class AttributeDict(dict): def __getattr__(self, attr): return self[attr] + def __setattr__(self, attr, value): self[attr] = value + if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) unittest.main() From 8817f5c7cbb0e2e570b02ec4ed2ffe59e1f12b3a Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 10 Jun 2024 01:01:34 -0500 Subject: [PATCH 114/174] Add cpu scheduling to schedulers.py and runner, small fixes --- .../custom_models/sd_inference/schedulers.py | 142 +++++++++++++----- .../custom_models/sd_inference/utils.py | 27 ++-- .../sdxl_scheduled_unet_runner.py | 126 ++++++++++------ .../custom_models/sdxl_inference/unet.py | 73 ++++----- 4 files changed, 232 insertions(+), 136 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 5bef5ae14..6e258077f 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -9,6 +9,7 @@ import torch from shark_turbine.aot import * +import shark_turbine.ops.iree as ops from iree.compiler.ir import Context import iree.runtime as ireert import numpy as np @@ -42,8 +43,10 @@ def __init__(self, rt_device, vmfb): def initialize(self, sample): return self.runner.ctx.modules.compiled_scheduler["run_initialize"](sample) - def scale_model_input(self, sample, t): - return self.runner.ctx.modules.compiled_scheduler["run_scale"](sample, t) + def scale_model_input(self, sample, t, timesteps): + return self.runner.ctx.modules.compiled_scheduler["run_scale"]( + sample, t, timesteps + ) def step(self, noise_pred, t, sample, guidance_scale, step_index): return self.runner.ctx.modules.compiled_scheduler["run_step"]( @@ -71,9 +74,11 @@ def __init__( self.batch_size = batch_size self.do_classifier_free_guidance = True self.model.set_timesteps(num_inference_steps) + self.timesteps = self.model.timesteps self.model.is_scale_input_called = True self.dtype = dtype + # TODO: Make steps dynamic here def initialize(self, sample): height = self.height width = self.width @@ -85,67 +90,106 @@ def initialize(self, sample): if self.do_classifier_free_guidance: add_time_ids = torch.cat([add_time_ids] * 2, dim=0) add_time_ids = add_time_ids.repeat(self.batch_size, 1).type(self.dtype) + step_count = 
torch.tensor(len(self.timesteps)) timesteps = self.model.timesteps - step_indexes = torch.tensor(len(timesteps)) + # ops.trace_tensor("timesteps", self.timesteps) sample = sample * self.model.init_noise_sigma - return sample.type(self.dtype), add_time_ids, step_indexes + return ( + sample.type(self.dtype), + add_time_ids, + step_count, + timesteps.type(self.dtype), + ) - def prepare_model_input(self, sample, t): - t = self.model.timesteps[t] + def prepare_model_input(self, sample, t, timesteps): + t = timesteps[t] if self.do_classifier_free_guidance: latent_model_input = torch.cat([sample] * 2) else: latent_model_input = sample - return self.model.scale_model_input(latent_model_input, t), t.type(self.dtype) + return self.model.scale_model_input(latent_model_input, t).type( + self.dtype + ), t.type(self.dtype) def step(self, noise_pred, t, sample, guidance_scale, i): self.model._step_index = i + if self.do_classifier_free_guidance: noise_preds = noise_pred.chunk(2) noise_pred = noise_preds[0] + guidance_scale * ( noise_preds[1] - noise_preds[0] ) - sample = self.model.step(noise_pred, t, sample)[0] + if self.model.config.skip_prk_steps == True: + sample = self.model.step_plms(noise_pred, t, sample, return_dict=False)[0] + else: + sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) +@torch.no_grad() class SharkSchedulerCPUWrapper: - def __init__(self, pipe, scheduler): + def __init__( + self, scheduler, batch_size, num_inference_steps, dest_device, latents_dtype + ): + self.do_classifier_free_guidance = True self.module = scheduler - self.dest = pipe.runners["unet"].config.device - self.dtype = pipe.iree_dtype + self.dest = dest_device + self.dtype = latents_dtype + self.batch_size = batch_size + self.module.set_timesteps(num_inference_steps) + self.torch_dtype = ( + torch.float32 if latents_dtype == "float32" else torch.float16 + ) def initialize(self, sample): - sample, add_time_ids, step_indexes = self.module.initialize( - torch.from_numpy(sample.to_host()) - ) - sample = ireert.asdevicearray(self.dest, sample, self.dtype) + height = sample.shape[2] + width = sample.shape[3] + original_size = (height, width) + target_size = (height, width) + crops_coords_top_left = (0, 0) + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids], dtype=self.torch_dtype) + if self.do_classifier_free_guidance: + add_time_ids = torch.cat([add_time_ids] * 2, dim=0) + add_time_ids = add_time_ids.repeat(self.batch_size, 1).type( + self.torch_dtype + ) + step_indexes = torch.tensor(len(self.module.timesteps)) + timesteps = self.module.timesteps + sample = sample * self.module.init_noise_sigma add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) + step_indexes = ireert.asdevicearray(self.dest, step_indexes, "int64") + return sample, add_time_ids, step_indexes, timesteps - return sample, add_time_ids, step_indexes - - def scale_model_input(self, sample, t): - scaled = ireert.asdevicearray( - self.dest, - self.module.scale_model_input(torch.from_numpy(sample.to_host()), t), - self.dtype, - ) - t = [self.module.model.timesteps[t]] - t = ireert.asdevicearray(self.dest, t, self.dtype) + def scale_model_input(self, sample, t, timesteps): + if self.do_classifier_free_guidance: + sample = torch.cat([sample] * 2) + t = timesteps[t] + scaled = self.module.scale_model_input(sample, t) + t = ireert.asdevicearray(self.dest, [t], self.dtype) return scaled, t - def step(self, latents, t, sample): - 
return ireert.asdevicearray( - self.dest, - self.module.step( - torch.from_numpy(latents.to_host()), - t, - torch.from_numpy(sample.to_host()), - ).prev_sample, - self.dtype, - ) + def step(self, latents, t, sample, guidance_scale, i): + if isinstance(latents, ireert.DeviceArray): + latents = torch.tensor(latents.to_host()) + if isinstance(t, ireert.DeviceArray): + t = self.module.timesteps[i] + if isinstance(sample, ireert.DeviceArray): + sample = torch.tensor(sample.to_host()) + if self.do_classifier_free_guidance: + noise_preds = latents.chunk(2) + latents = noise_preds[0] + guidance_scale * ( + noise_preds[1] - noise_preds[0] + ) + return self.module.step( + latents, + t, + sample, + return_dict=False, + )[0] +@torch.no_grad() def export_scheduler_model( hf_model_name: str, scheduler_id: str, @@ -214,10 +258,17 @@ def export_scheduler_model( width // 8, ) example_init_args = [torch.empty(sample, dtype=dtype)] - example_prep_args = [ + example_prep_args = ( torch.empty(sample, dtype=dtype), torch.empty(1, dtype=torch.int64), - ] + torch.empty([19], dtype=dtype), + ) + timesteps = torch.export.Dim("timesteps") + prep_dynamic_args = { + "sample": {}, + "t": {}, + "timesteps": {0: timesteps}, + } example_step_args = [ torch.empty(noise_pred_shape, dtype=dtype), torch.empty(1, dtype=dtype), @@ -235,10 +286,11 @@ def _initialize(module, sample): return module.initialize(*sample) @fxb.export_program( - args=(example_prep_args,), + args=example_prep_args, + dynamic_shapes=prep_dynamic_args, ) - def _scale(module, input): - return module.prepare_model_input(*input) + def _scale(module, sample, t, timesteps): + return module.prepare_model_input(sample, t, timesteps) @fxb.export_program( args=(example_step_args,), @@ -292,6 +344,13 @@ def get_scheduler(model_id, scheduler_id): scheduler = SCHEDULER_MAP[scheduler_id].from_pretrained( model_id, subfolder="scheduler" ) + elif all(x in scheduler_id for x in ["DPMSolverMultistep", "++"]): + scheduler = DPMSolverMultistepScheduler.from_pretrained( + model_id, subfolder="scheduler", algorithm_type="dpmsolver++" + ) + if "Karras" in scheduler_id: + scheduler.config.use_karras_sigmas = True + return scheduler @@ -309,6 +368,9 @@ def get_scheduler(model_id, scheduler_id): "KDPM2AncestralDiscrete": KDPM2AncestralDiscreteScheduler, "HeunDiscrete": HeunDiscreteScheduler, "DPMSolverMultistepKarras": DPMSolverMultistepScheduler, + "DPMSolverMultistep": DPMSolverMultistepScheduler, + "DPMSolverSDE": DPMSolverSDEScheduler, + "DPMSolverSDEKarras": DPMSolverSDEScheduler, } if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 40e8051cd..36b1b3b43 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -163,15 +163,6 @@ def compile_to_vmfb( ["--iree-hal-dump-executable-files-to=" + safe_name + "_dispatches"] ) - for i, flag in enumerate(ireec_flags): - k = flag.strip().split("=")[0] - for idx, default in enumerate(flags): - if k == default.split("=")[0]: - flags[idx] = flag - ireec_flags[i] = "" - if flag not in [None, "", " "]: - flags.append(flag) - if target_triple in ["gfx940", "gfx941", "gfx942", "gfx90a"]: if "unet" in safe_name: flags.extend(MI_flags["unet"]) @@ -196,6 +187,24 @@ def compile_to_vmfb( if attn_spec: flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) + for i, flag in enumerate(ireec_flags): + k = flag.strip().split("=")[0] + for idx, 
default in enumerate(flags): + if default == None: + flags.pop(idx) + continue + elif k == default.split("=")[0]: + flags[idx] = flag if flag.split("=")[-1] not in ["None", ""] else None + flag = None + if flags[idx] == None: + flags.pop(idx) + continue + if flag not in [None, "", " "] and flag.split("=")[-1] not in ["None", ""]: + flags.append(flag) + + for idx, flag in enumerate(flags): + if flag is None: + flags.pop(idx) print("Compiling to", device, "with flags:", flags) if mlir_source == "file": diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py index 8d8446ccf..5e90596d9 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet_runner.py @@ -5,10 +5,12 @@ import torch import numpy as np from tqdm.auto import tqdm +from shark_turbine.ops.iree import trace_tensor torch.random.manual_seed(0) +@torch.no_grad() def run_torch_scheduled_unet( sample, prompt_embeds, @@ -140,20 +142,35 @@ def run_unet_split_scheduled( text_embeds, args, ): + dtype = "float16" if args.precision == "fp16" else "float32" + torch_dtype = torch.float16 if args.precision == "fp16" else torch.float32 unet_runner = vmfbRunner( args.device, args.vmfb_path, args.external_weight_path, ) - scheduler = schedulers.SharkSchedulerWrapper( - args.device, - args.scheduler_vmfb_path, - ) - dtype = "float16" if args.precision == "fp16" else "float32" - guidance_scale = ireert.asdevicearray( - scheduler.runner.config.device, np.asarray([args.guidance_scale]), dtype=dtype - ) - sample, time_ids, steps = scheduler.initialize(sample) + if not args.scheduler_vmfb_path: + print("--scheduler_vmfb_path not supplied. 
Using cpu scheduling.") + scheduler = schedulers.get_scheduler(args.hf_model_name, args.scheduler_id) + scheduler = schedulers.SharkSchedulerCPUWrapper( + scheduler, + args.batch_size, + args.num_inference_steps, + unet_runner.config.device, + dtype, + ) + guidance_scale = torch.tensor([args.guidance_scale]) + else: + scheduler = schedulers.SharkSchedulerWrapper( + args.device, + args.scheduler_vmfb_path, + ) + guidance_scale = ireert.asdevicearray( + scheduler.runner.config.device, + np.asarray([args.guidance_scale]), + dtype=dtype, + ) + sample, time_ids, steps, timesteps = scheduler.initialize(sample) iree_inputs = [ sample, ireert.asdevicearray(unet_runner.config.device, prompt_embeds), @@ -162,13 +179,17 @@ def run_unet_split_scheduled( None, ] for i in range(steps.to_host()): - print(f"step {i}") - step_index = ireert.asdevicearray( - unet_runner.config.device, torch.tensor([i]), dtype="int64" - ) + # print(f"step {i}") + if args.scheduler_vmfb_path: + step_index = ireert.asdevicearray( + unet_runner.config.device, torch.tensor([i]), dtype="int64" + ) + else: + step_index = i latents, t = scheduler.scale_model_input( sample, step_index, + timesteps, ) noise_pred = unet_runner.ctx.modules.compiled_unet["run_forward"]( latents, @@ -180,13 +201,14 @@ def run_unet_split_scheduled( sample = scheduler.step( noise_pred, t, - iree_inputs[0], + sample, guidance_scale, step_index, ) return sample +@torch.no_grad() def run_torch_diffusers_loop( sample, prompt_embeds, @@ -200,40 +222,48 @@ def run_torch_diffusers_loop( args.hf_auth_token, precision="fp32", ) - scheduler = utils.get_schedulers(args.hf_model_name)[args.scheduler_id] - + scheduler = schedulers.get_scheduler(args.hf_model_name, args.scheduler_id) + if args.scheduler_id == "PNDM": + scheduler.config.skip_prk_steps = True scheduler.set_timesteps(args.num_inference_steps) - scheduler.is_scale_input_called = True + timesteps = scheduler.timesteps + print(timesteps) sample = sample * scheduler.init_noise_sigma - height = sample.shape[-2] * 8 - width = sample.shape[-1] * 8 + height = args.height + width = args.width original_size = (height, width) target_size = (height, width) crops_coords_top_left = (0, 0) add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids, add_time_ids], dtype=torch.float32) + add_time_ids = torch.tensor([add_time_ids], dtype=torch.float32) + add_time_ids = torch.cat([add_time_ids] * 2, dim=0) add_time_ids = add_time_ids.repeat(args.batch_size * 1, 1) sample = sample.to(torch.float32) prompt_embeds = prompt_embeds.to(torch.float32) text_embeds = text_embeds.to(torch.float32) - for idx, i in enumerate(scheduler.timesteps): - timestep = i - - latent_model_input = scheduler.scale_model_input(sample, timestep) + for idx, t in enumerate(timesteps): + print(t) + latent_model_input = torch.cat([sample] * 2) + latent_model_input = scheduler.scale_model_input(latent_model_input, t) noise_pred = unet_model.forward( latent_model_input, - timestep, + t, prompt_embeds, text_embeds, add_time_ids, - args.guidance_scale, + ) + # print("NOISE_PRED: ", noise_pred) + # print("STEP_INDEX : ", idx) + noise_preds = noise_pred.chunk(2) + noise_pred = noise_preds[0] + args.guidance_scale * ( + noise_preds[1] - noise_preds[0] ) sample = scheduler.step( noise_pred, - timestep, + t, sample, return_dict=False, )[0] @@ -258,7 +288,7 @@ def run_torch_diffusers_loop( init_batch_dim * args.batch_size, args.max_length, 2048, dtype=dtype ) text_embeds = torch.rand(init_batch_dim * 
args.batch_size, 1280, dtype=dtype) - time_ids = torch.rand(init_batch_dim * args.batch_size, 6) + time_ids = torch.rand(init_batch_dim * args.batch_size, 6, dtype=dtype) if args.compiled_pipeline: assert ( args.pipeline_vmfb_path is not None @@ -275,22 +305,23 @@ def run_torch_diffusers_loop( turbine_compiled_output.shape, turbine_compiled_output.dtype, ) + turbine_output = turbine_compiled_output elif args.split_scheduler: - assert ( - args.scheduler_vmfb_path is not None - ), "--scheduler_vmfb_path is required for split scheduler run" turbine_split_output = run_unet_split_scheduled( sample, prompt_embeds, text_embeds, args, - ).to_host() + ) + if args.scheduler_vmfb_path: + turbine_split_output = turbine_split_output.to_host() print( "TURBINE SPLIT OUTPUT:", turbine_split_output, turbine_split_output.shape, turbine_split_output.dtype, ) + turbine_output = turbine_split_output else: turbine_python_output = run_scheduled_unet_python( sample, @@ -304,12 +335,19 @@ def run_torch_diffusers_loop( turbine_python_output.shape, turbine_python_output.dtype, ) + turbine_output = turbine_python_output if args.compare_vs_torch: - from turbine_models.custom_models.sd_inference import utils - + if args.scheduler_id == "EulerAncestralDiscrete" and args.scheduler_vmfb_path: + print( + f"WARNING: {args.scheduler_id} scheduler adds random noise to results and we haven't piped through a torch generator yet to fix the seed. Expect mismatch results." + ) + if args.scheduler_id == "PNDM" and args.scheduler_vmfb_path: + print( + f"WARNING: {args.scheduler_id} scheduler normally uses data-dependent control flow with counters and other data dependence. Expect different results after 1 step." + ) print("generating torch output: ") - torch_output = run_torch_scheduled_unet( + torch_output = run_torch_diffusers_loop( sample, prompt_embeds, text_embeds, @@ -317,21 +355,15 @@ def run_torch_diffusers_loop( ) print("torch OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - print("\n(torch sched unet loop to iree python loop): ") + print("\n(torch (diffusers) image latents to iree image latents): ") try: np.testing.assert_allclose( - turbine_python_output, torch_output, rtol=4e-2, atol=4e-2 + turbine_output, torch_output, rtol=4e-2, atol=4e-2 ) print("passed!") except AssertionError as err: - print(err) - - if args.compiled_pipeline: - print("\n(torch sched unet loop to iree compiled loop): ") - try: - np.testing.assert_allclose( - turbine_compiled_output, torch_output, rtol=4e-2, atol=4e-2 + if args.scheduler_id == "EulerAncestralDiscrete": + print( + "Expected failure matching numerics due to intentionally random noise in results." 
) - print("passed!") - except AssertionError as err: - print(err) + print(err) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 6665b18a7..5f4ce43bb 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -52,20 +52,21 @@ def __init__(self, hf_model_name, hf_auth_token=None, precision="fp32"): # else: self.do_classifier_free_guidance = True - def forward(self, latents, timestep, prompt_embeds, text_embeds, time_ids): - with torch.no_grad(): - added_cond_kwargs = { - "text_embeds": text_embeds, - "time_ids": time_ids, - } - noise_pred = self.unet.forward( - latents, - timestep, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=None, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] + def forward( + self, latent_model_input, timestep, prompt_embeds, text_embeds, time_ids + ): + added_cond_kwargs = { + "text_embeds": text_embeds, + "time_ids": time_ids, + } + noise_pred = self.unet.forward( + latent_model_input, + timestep, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] return noise_pred @@ -95,6 +96,8 @@ def export_unet_model( hf_model_name, f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}", ) + if args.decomp_attn == True: + ireec_flags += ",--iree-opt-aggressively-propagate-transposes=False" if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -110,14 +113,6 @@ def export_unet_model( return vmfb_path mapper = {} - decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) - if decomp_attn == True: - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": @@ -145,35 +140,33 @@ def export_unet_model( text_embeds_shape = (init_batch_dim * batch_size, 1280) example_forward_args = [ torch.empty(prepared_latents, dtype=dtype), - torch.empty(1, dtype=dtype), # timestep + torch.empty(1, dtype=dtype), torch.empty(prompt_embeds_shape, dtype=dtype), torch.empty(text_embeds_shape, dtype=dtype), torch.empty(time_ids_shape, dtype=dtype), ] - fxb = FxProgramsBuilder(unet_model) - - @fxb.export_program( - args=(example_forward_args,), - ) - def _forward( - module, - inputs, - ): - return module.forward(*inputs) - decomp_list = [] if decomp_attn == True: - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] with decompositions.extend_aot_decompositions( from_current=True, add_ops=decomp_list, ): + fxb = FxProgramsBuilder(unet_model) + + @fxb.export_program( + args=(example_forward_args,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) class CompiledUnet(CompiledModule): run_forward = _forward From 9b976667a1e861b8c3ae05699843d446d1f0e04d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 10 Jun 2024 01:03:10 -0500 Subject: [PATCH 115/174] Bump diffusers fork version. 
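Pins the nod-ai diffusers fork to the v0.28.2-shark tag (previously v0.24.0-release) in models/requirements.txt. A quick local check after reinstalling the requirements, as a sketch only (the exact version string reported by the fork is an assumption here, not taken from this patch):

    # Verify the bumped dependency resolved as expected.
    import diffusers
    print(diffusers.__version__)  # expect a 0.28.x version from the nod-ai fork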
--- models/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/requirements.txt b/models/requirements.txt index 6744d3238..b775c76cd 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -4,7 +4,7 @@ shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 torchsde accelerate -diffusers @ git+https://github.com/nod-ai/diffusers@v0.24.0-release +diffusers @ git+https://github.com/nod-ai/diffusers@v0.28.2-shark brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading azure-storage-blob From 2500d14bfba54123479c18f4a792dc057b0d2481 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 10 Jun 2024 20:12:17 -0500 Subject: [PATCH 116/174] Fixes for cpu schedulers, add split scheduler support to sdxl pipeline --- .../custom_models/sd_inference/schedulers.py | 82 +++--- .../custom_models/sd_inference/utils.py | 1 - .../sdxl_inference/sdxl_cmd_opts.py | 7 + .../sdxl_inference/sdxl_compiled_pipeline.py | 236 +++++++++++++++--- .../sdxl_inference/sdxl_scheduled_unet.py | 2 - .../custom_models/sdxl_inference/unet.py | 27 +- 6 files changed, 271 insertions(+), 84 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 6e258077f..a88a7ad3c 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -41,7 +41,8 @@ def __init__(self, rt_device, vmfb): self.runner = vmfbRunner(rt_device, vmfb, None) def initialize(self, sample): - return self.runner.ctx.modules.compiled_scheduler["run_initialize"](sample) + sample, time_ids, steps, timesteps = self.runner.ctx.modules.compiled_scheduler["run_initialize"](sample) + return sample, time_ids, steps.to_host(), timesteps def scale_model_input(self, sample, t, timesteps): return self.runner.ctx.modules.compiled_scheduler["run_scale"]( @@ -49,6 +50,11 @@ def scale_model_input(self, sample, t, timesteps): ) def step(self, noise_pred, t, sample, guidance_scale, step_index): + print( + noise_pred.to_host()[:,:,0,2], + t, + sample.to_host()[:,:,0,2], + ) return self.runner.ctx.modules.compiled_scheduler["run_step"]( noise_pred, t, sample, guidance_scale, step_index ) @@ -98,7 +104,7 @@ def initialize(self, sample): sample.type(self.dtype), add_time_ids, step_count, - timesteps.type(self.dtype), + timesteps.type(torch.float32), ) def prepare_model_input(self, sample, t, timesteps): @@ -119,15 +125,11 @@ def step(self, noise_pred, t, sample, guidance_scale, i): noise_pred = noise_preds[0] + guidance_scale * ( noise_preds[1] - noise_preds[0] ) - if self.model.config.skip_prk_steps == True: - sample = self.model.step_plms(noise_pred, t, sample, return_dict=False)[0] - else: - sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] + sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) - -@torch.no_grad() class SharkSchedulerCPUWrapper: + @torch.no_grad() def __init__( self, scheduler, batch_size, num_inference_steps, dest_device, latents_dtype ): @@ -137,13 +139,16 @@ def __init__( self.dtype = latents_dtype self.batch_size = batch_size self.module.set_timesteps(num_inference_steps) + self.timesteps = self.module.timesteps self.torch_dtype = ( torch.float32 if latents_dtype == "float32" else torch.float16 ) def initialize(self, sample): - height = sample.shape[2] - 
width = sample.shape[3] + if isinstance(sample, ireert.DeviceArray): + sample = torch.tensor(sample.to_host(), dtype=torch.float32) + height = sample.shape[2] * 8 + width = sample.shape[3] * 8 original_size = (height, width) target_size = (height, width) crops_coords_top_left = (0, 0) @@ -155,10 +160,10 @@ def initialize(self, sample): self.torch_dtype ) step_indexes = torch.tensor(len(self.module.timesteps)) - timesteps = self.module.timesteps + timesteps = self.timesteps sample = sample * self.module.init_noise_sigma + print(sample, add_time_ids, step_indexes, timesteps) add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) - step_indexes = ireert.asdevicearray(self.dest, step_indexes, "int64") return sample, add_time_ids, step_indexes, timesteps def scale_model_input(self, sample, t, timesteps): @@ -167,24 +172,27 @@ def scale_model_input(self, sample, t, timesteps): t = timesteps[t] scaled = self.module.scale_model_input(sample, t) t = ireert.asdevicearray(self.dest, [t], self.dtype) + scaled = ireert.asdevicearray(self.dest, scaled, self.dtype) return scaled, t - def step(self, latents, t, sample, guidance_scale, i): - if isinstance(latents, ireert.DeviceArray): - latents = torch.tensor(latents.to_host()) + def step(self, noise_pred, t, latents, guidance_scale, i): if isinstance(t, ireert.DeviceArray): - t = self.module.timesteps[i] - if isinstance(sample, ireert.DeviceArray): - sample = torch.tensor(sample.to_host()) + t = torch.tensor(t.to_host()) + if isinstance(guidance_scale, ireert.DeviceArray): + guidance_scale = torch.tensor(guidance_scale.to_host()) + noise_pred = torch.tensor(noise_pred.to_host()) if self.do_classifier_free_guidance: - noise_preds = latents.chunk(2) - latents = noise_preds[0] + guidance_scale * ( - noise_preds[1] - noise_preds[0] - ) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + print( + noise_pred[:,:,0,2], + t, + latents[:,:,0,2], + ) return self.module.step( - latents, + noise_pred, t, - sample, + latents, return_dict=False, )[0] @@ -212,19 +220,23 @@ def export_scheduler_model( scheduler_module = SchedulingModel( hf_model_name, scheduler, height, width, batch_size, num_inference_steps, dtype ) - vmfb_names = [ - scheduler_id + "Scheduler", - f"bs{batch_size}", - f"{height}x{width}", - precision, - str(num_inference_steps), - target_triple, - ] - vmfb_name = "_".join(vmfb_names) - if pipeline_dir: + vmfb_names = [ + scheduler_id + "Scheduler", + str(num_inference_steps), + ] + vmfb_name = "_".join(vmfb_names) safe_name = os.path.join(pipeline_dir, vmfb_name) else: + vmfb_names = [ + scheduler_id + "Scheduler", + f"bs{batch_size}", + f"{height}x{width}", + precision, + str(num_inference_steps), + target_triple, + ] + vmfb_name = "_".join(vmfb_names) safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) if input_mlir: @@ -261,7 +273,7 @@ def export_scheduler_model( example_prep_args = ( torch.empty(sample, dtype=dtype), torch.empty(1, dtype=torch.int64), - torch.empty([19], dtype=dtype), + torch.empty([19], dtype=torch.float32), ) timesteps = torch.export.Dim("timesteps") prep_dynamic_args = { diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 36b1b3b43..840c8bd1a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -136,7 +136,6 @@ def compile_to_vmfb( 
"--iree-hal-target-backends=rocm", "--iree-rocm-target-chip=" + target_triple, "--iree-vm-bytecode-module-output-format=flatbuffer-binary", - "--iree-flow-inline-constants-max-byte-length=1", ] ) if target_triple == "gfx942": diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py index 410aa91af..7acf5d528 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py @@ -130,6 +130,13 @@ def is_valid_file(arg): help="Use a decoupled unet and scheduler for better QOL.", ) +p.add_argument( + "--cpu_scheduling", + default=False, + action="store_true", + help="Run scheduling on torch cpu (will be slower due to data movement costs).", +) + p.add_argument( "--external_weight_file", type=str, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 4cea924e0..7a2c3eefb 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -10,9 +10,10 @@ sdxl_prompt_encoder, sdxl_scheduled_unet, vae, + unet, ) import iree.runtime as ireert -from turbine_models.custom_models.sd_inference import utils +from turbine_models.custom_models.sd_inference import utils, schedulers from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( get_pipeline_ir, ) @@ -24,6 +25,7 @@ import os import numpy as np import time +import copy from datetime import datetime as dt device_list = [ @@ -79,6 +81,7 @@ def __init__( external_weights: str = "safetensors", vae_decomp_attn: bool = True, custom_vae: str = "", + cpu_scheduling: bool = False, ): self.hf_model_name = hf_model_name self.scheduler_id = scheduler_id @@ -98,6 +101,7 @@ def __init__( self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn self.custom_vae = custom_vae + self.cpu_scheduling = cpu_scheduling # TODO: set this based on user-inputted guidance scale and negative prompt. 
self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True @@ -123,11 +127,12 @@ def check_prepared( if do_continue.lower() == "y": for submodel in vmfbs.keys(): if vmfbs[submodel] == None: + print(submodel) vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) vmfbs[submodel] = vmfb if weights[submodel] is None: weights[submodel] = weight - elif weights[submodel] is None and "pipeline" not in submodel: + elif weights[submodel] is None and not any(x in submodel for x in ["pipeline", "scheduler"]): _, weight = self.export_submodel(submodel, weights_only=True) weights[submodel] = weight ready, vmfbs, weights = self.is_prepared(vmfbs, weights) @@ -147,6 +152,13 @@ def is_prepared(self, vmfbs, weights): if key == "scheduled_unet": val = f"{self.scheduler_id}_unet_{self.num_inference_steps}" default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") + elif key == "scheduler" and not self.cpu_scheduling: + val = f"{self.scheduler_id}Scheduler_{self.num_inference_steps}" + default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") + elif key == "scheduler": + val = None + default_filepath=None + continue else: val = vmfbs[key] default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") @@ -159,7 +171,7 @@ def is_prepared(self, vmfbs, weights): else: missing.append(val + ".vmfb") for w_key in weights: - if "pipeline" in w_key: + if any(x in w_key for x in ["pipeline", "scheduler"]): continue if weights[w_key] is not None: continue @@ -170,6 +182,12 @@ def is_prepared(self, vmfbs, weights): ) if weights[w_key] is None and os.path.exists(default_name): weights[w_key] = os.path.join(default_name) + elif w_key in ["scheduled_unet"] and os.path.exists( + os.path.join(self.external_weights_dir, "unet." + self.external_weights) + ): + weights[w_key] = os.path.join( + self.external_weights_dir, "unet." + self.external_weights + ) else: missing.append(w_key + "." + self.external_weights) if len(missing) > 0: @@ -219,6 +237,13 @@ def get_torch_models(self, submodel): ), ) return vae_torch + case "unet": + unet_torch = unet.UnetModel( + self.hf_model_name, + None, + self.precision, + ) + return unet_torch def export_submodel( self, @@ -230,12 +255,12 @@ def export_submodel( os.makedirs(self.pipeline_dir) if self.external_weights and self.external_weights_dir: if not os.path.exists(self.external_weights_dir): - os.makedirs(external_weights_dir, exist_ok=True) + os.makedirs(self.external_weights_dir, exist_ok=True) vae_external_weight_path = os.path.join( self.external_weights_dir, "vae_decode." + self.external_weights ) unet_external_weight_path = os.path.join( - self.external_weights_dir, "scheduled_unet." + self.external_weights + self.external_weights_dir, "unet." + self.external_weights ) prompt_encoder_external_weight_path = os.path.join( self.external_weights_dir, "prompt_encoder." + self.external_weights @@ -258,7 +283,7 @@ def export_submodel( self.pipeline_dir, "vae_decode." + self.external_weights ) unet_external_weight_path = os.path.join( - self.pipeline_dir, "scheduled_unet." + self.external_weights + self.pipeline_dir, "unet." + self.external_weights ) prompt_encoder_external_weight_path = os.path.join( self.pipeline_dir, "prompt_encoder." 
+ self.external_weights @@ -268,6 +293,7 @@ def export_submodel( "vae_decode": None, "prompt_encoder": None, "scheduled_unet": None, + "unet": None, "pipeline": None, "full_pipeline": None, } @@ -301,7 +327,58 @@ def export_submodel( input_mlir=input_mlir["scheduled_unet"], weights_only=weights_only, ) + del scheduled_unet_torch return unet_vmfb, unet_external_weight_path + case "unet": + if not input_mlir[submodel]: + unet_torch = self.get_torch_models("unet") + else: + unet_torch = None + unet_vmfb = unet.export_unet_model( + unet_torch, + self.hf_model_name, + self.batch_size, + self.height, + self.width, + self.precision, + self.max_length, + None, + "vmfb", + self.external_weights, + unet_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["unet"], + self.decomp_attn, + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + attn_spec=self.attn_spec, + input_mlir=input_mlir["unet"], + weights_only=weights_only, + ) + del unet_torch + return unet_vmfb, unet_external_weight_path + case "scheduler": + if self.cpu_scheduling: + return None, None + else: + scheduler_vmfb = schedulers.export_scheduler_model( + self.hf_model_name, + self.scheduler_id, + self.batch_size, + self.height, + self.width, + self.num_inference_steps, + self.precision, + "vmfb", + self.device, + self.iree_target_triple, + self.ireec_flags["scheduler"], + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + input_mlir=input_mlir["scheduler"], + ) + return scheduler_vmfb, None case "vae_decode": if not input_mlir[submodel]: vae_torch = self.get_torch_models("vae_decode") @@ -328,6 +405,7 @@ def export_submodel( input_mlir=input_mlir["vae_decode"], weights_only=weights_only, ) + del vae_torch return vae_decode_vmfb, vae_external_weight_path case "prompt_encoder": _, prompt_encoder_vmfb = sdxl_prompt_encoder.export_prompt_encoder( @@ -395,11 +473,52 @@ def load_pipeline( vmfbs: dict, weights: dict, rt_device: str = "local-task", - compiled_pipeline: bool = True, + compiled_pipeline: bool = False, + split_scheduler: bool = True, ): self.runners = {} runners = {} - if compiled_pipeline: + load_start = time.time() + if split_scheduler: + runners["pipe"] = vmfbRunner( + rt_device, + vmfbs["unet"], + weights["unet"], + ) + unet_loaded = time.time() + print("\n[LOG] Unet loaded in ", unet_loaded - load_start, "sec") + if not self.cpu_scheduling: + runners["scheduler"] = schedulers.SharkSchedulerWrapper( + args.device, + vmfbs["scheduler"], + ) + else: + print("\n[LOG] Running scheduler on CPU. 
This will affect performance.") + scheduler = schedulers.get_scheduler(args.hf_model_name, args.scheduler_id) + runners["scheduler"] = schedulers.SharkSchedulerCPUWrapper( + scheduler, + args.batch_size, + args.num_inference_steps, + runners["pipe"].config.device, + latents_dtype="float32" if args.precision == "fp32" else "float16", + ) + sched_loaded = time.time() + print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") + runners["vae_decode"] = vmfbRunner( + rt_device, + vmfbs["vae_decode"], + weights["vae_decode"], + ) + vae_loaded = time.time() + print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") + runners["prompt_encoder"] = vmfbRunner( + rt_device, + vmfbs["prompt_encoder"], + weights["prompt_encoder"], + ) + clip_loaded = time.time() + print("\n[LOG] CLIP loaded in ", clip_loaded - vae_loaded, "sec") + elif compiled_pipeline: runners["pipe"] = vmfbRunner( rt_device, [ @@ -415,6 +534,9 @@ def load_pipeline( None, ], ) + pipe_loaded = time.time() + print("\n[LOG] Compiled Pipeline loaded in ", pipe_loaded - load_start, "sec") + else: runners["pipe"] = vmfbRunner( rt_device, @@ -433,6 +555,9 @@ def load_pipeline( ) runners["vae_decode"] = runners["pipe"] runners["prompt_encoder"] = runners["pipe"] + pipe_loaded = time.time() + print("\n[LOG] Compiled Pipeline loaded in ", pipe_loaded - load_start, "sec") + tok_start = time.time() runners["tokenizer_1"] = CLIPTokenizer.from_pretrained( self.hf_model_name, subfolder="tokenizer", @@ -441,6 +566,8 @@ def load_pipeline( self.hf_model_name, subfolder="tokenizer_2", ) + tok_loaded = time.time() + print("\n[LOG] Tokenizers loaded in ", tok_loaded - tok_start, "sec") self.runners = runners self.compiled_pipeline = compiled_pipeline print("Successfully loaded pipeline.") @@ -576,9 +703,50 @@ def generate_images( for i in range(batch_count): unet_start = time.time() - latents = self.runners["pipe"].ctx.modules.sdxl_compiled_pipeline[ - "produce_image_latents" - ](samples[i], prompt_embeds, add_text_embeds, guidance_scale) + if self.runners["scheduler"]: + sample, time_ids, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) + iree_inputs = [ + sample, + ireert.asdevicearray(self.runners["pipe"].config.device, prompt_embeds), + ireert.asdevicearray(self.runners["pipe"].config.device, add_text_embeds), + time_ids, + None, + ] + for s in range(steps): + # print(f"step {s}") + if self.cpu_scheduling: + step_index = s + else: + step_index = ireert.asdevicearray(self.runners["scheduler"].runner.config.device, torch.tensor([s]), "int64") + latents, t = self.runners["scheduler"].scale_model_input( + sample, + step_index, + timesteps, + ) + noise_pred = self.runners["pipe"].ctx.modules.compiled_unet["run_forward"]( + latents, + t, + iree_inputs[1], + iree_inputs[2], + iree_inputs[3], + ) + sample = self.runners["scheduler"].step( + noise_pred, + t, + sample, + guidance_scale, + step_index, + ) + if isinstance(sample, torch.Tensor): + #TODO: pipe an option for vae_dtype + vae_dtype = "float32" if self.precision == "fp32" else "float16" + latents = ireert.asdevicearray(self.runners["vae_decode"].config.device, sample, dtype=vae_dtype) + else: + latents = sample + else: + latents = self.runners["pipe"].ctx.modules.sdxl_compiled_pipeline[ + "produce_image_latents" + ](samples[i], prompt_embeds, add_text_embeds, guidance_scale) vae_start = time.time() vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( @@ -613,7 +781,6 @@ def generate_images( end = time.time() print("Total CLIP time:", 
encode_prompts_end - encode_prompts_start, "sec") print("Total tokenize time:", encode_prompts_start - tokenize_start, "sec") - print("Loading time: ", encode_prompts_start - pipe_start, "sec") if batch_count > 1: print( f"Total inference time ({batch_count} batch(es)):", @@ -666,43 +833,36 @@ def numpy_to_pil_image(images): if __name__ == "__main__": from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args - - mlirs = { - "prompt_encoder": None, - "scheduled_unet": None, - "vae_decode": None, - "pipeline": None, - "full_pipeline": None, - } - vmfbs = { - "prompt_encoder": None, - "scheduled_unet": None, - "vae_decode": None, - "pipeline": None, - "full_pipeline": None, - } - weights = { - "prompt_encoder": None, - "scheduled_unet": None, - "vae_decode": None, - "pipeline": None, - "full_pipeline": None, - } + map = empty_pipe_dict + if args.split_scheduler: + map["scheduler"] = None + map["unet"] = None + map.pop("scheduled_unet") + map.pop("pipeline") + map.pop("full_pipeline") + if args.cpu_scheduling: + map.pop("scheduler") + mlirs = copy.deepcopy(map) + vmfbs = copy.deepcopy(map) + weights = copy.deepcopy(map) ireec_flags = { "clip": args.ireec_flags + args.clip_flags, "unet": args.ireec_flags + args.unet_flags, "vae": args.ireec_flags + args.vae_flags, "pipeline": args.ireec_flags, + "scheduler": args.ireec_flags, } if not args.pipeline_dir: pipe_id_list = [ - "sdxl_1_0", + args.hf_model_name.split("/")[-1], str(args.height), str(args.width), str(args.max_length), args.precision, args.device, ] + if args.decomp_attn: + pipe_id_list.append("decomp") args.pipeline_dir = os.path.join( ".", "_".join(pipe_id_list), @@ -734,9 +894,13 @@ def numpy_to_pil_image(images): args.external_weights_dir, args.external_weights, args.vae_decomp_attn, + custom_vae = None, + cpu_scheduling = args.cpu_scheduling, ) vmfbs, weights = sdxl_pipe.check_prepared(mlirs, vmfbs, weights) - sdxl_pipe.load_pipeline(vmfbs, weights, args.rt_device, args.compiled_pipeline) + if args.cpu_scheduling: + vmfbs["scheduler"] = None + sdxl_pipe.load_pipeline(vmfbs, weights, args.rt_device, args.compiled_pipeline, args.split_scheduler) sdxl_pipe.generate_images( args.prompt, args.negative_prompt, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 3a833b131..21597d457 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -267,8 +267,6 @@ class CompiledScheduledUnet(CompiledModule): if external_weights: externalize_module_parameters(scheduled_unet_model) - if external_weight_path and len(external_weight_path) > 1: - save_module_parameters(external_weight_path, scheduled_unet_model) inst = CompiledScheduledUnet(context=Context(), import_to="IMPORT") diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 5f4ce43bb..dd42ce457 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -88,15 +88,21 @@ def export_unet_model( ireec_flags=None, decomp_attn=False, exit_on_vmfb=False, + pipeline_dir=None, attn_spec=None, input_mlir=None, weights_only=False, ): - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}", - ) - if args.decomp_attn == True: + 
if pipeline_dir: + safe_name = os.path.join( + pipeline_dir, f"unet" + ) + else: + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet", + ) + if decomp_attn == True: ireec_flags += ",--iree-opt-aggressively-propagate-transposes=False" if input_mlir: @@ -105,7 +111,7 @@ def export_unet_model( device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, @@ -173,8 +179,6 @@ class CompiledUnet(CompiledModule): if external_weights: externalize_module_parameters(unet_model) - if external_weight_path and len(external_weight_path) > 1: - save_module_parameters(external_weight_path, unet_model) inst = CompiledUnet(context=Context(), import_to="IMPORT") @@ -183,15 +187,18 @@ class CompiledUnet(CompiledModule): if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb( + vmfb_path = utils.compile_to_vmfb( module_str, device, target_triple, ireec_flags, safe_name, - return_path=False, + return_path=True, attn_spec=attn_spec, ) + if exit_on_vmfb: + exit() + return vmfb_path if __name__ == "__main__": From 52473eb7fc85a3d6f79b64ab03bd6fe24c7fac23 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 11 Jun 2024 11:30:41 -0500 Subject: [PATCH 117/174] Start fixing tests --- .../custom_models/sd_inference/schedulers.py | 18 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 57 +++++-- .../sdxl_prompt_encoder_runner.py | 113 +++++-------- .../custom_models/sdxl_inference/unet.py | 4 +- models/turbine_models/tests/sdxl_test.py | 160 +++++++----------- models/turbine_models/utils/sdxl_benchmark.py | 2 + 6 files changed, 161 insertions(+), 193 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index a88a7ad3c..ec58a0d64 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -41,7 +41,9 @@ def __init__(self, rt_device, vmfb): self.runner = vmfbRunner(rt_device, vmfb, None) def initialize(self, sample): - sample, time_ids, steps, timesteps = self.runner.ctx.modules.compiled_scheduler["run_initialize"](sample) + sample, time_ids, steps, timesteps = self.runner.ctx.modules.compiled_scheduler[ + "run_initialize" + ](sample) return sample, time_ids, steps.to_host(), timesteps def scale_model_input(self, sample, t, timesteps): @@ -50,11 +52,6 @@ def scale_model_input(self, sample, t, timesteps): ) def step(self, noise_pred, t, sample, guidance_scale, step_index): - print( - noise_pred.to_host()[:,:,0,2], - t, - sample.to_host()[:,:,0,2], - ) return self.runner.ctx.modules.compiled_scheduler["run_step"]( noise_pred, t, sample, guidance_scale, step_index ) @@ -128,6 +125,7 @@ def step(self, noise_pred, t, sample, guidance_scale, i): sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) + class SharkSchedulerCPUWrapper: @torch.no_grad() def __init__( @@ -183,11 +181,13 @@ def step(self, noise_pred, t, latents, guidance_scale, i): noise_pred = torch.tensor(noise_pred.to_host()) if self.do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) print( - noise_pred[:,:,0,2], + noise_pred[:, :, 0, 2], 
t, - latents[:,:,0,2], + latents[:, :, 0, 2], ) return self.module.step( noise_pred, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 7a2c3eefb..514c73118 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -132,7 +132,9 @@ def check_prepared( vmfbs[submodel] = vmfb if weights[submodel] is None: weights[submodel] = weight - elif weights[submodel] is None and not any(x in submodel for x in ["pipeline", "scheduler"]): + elif weights[submodel] is None and not any( + x in submodel for x in ["pipeline", "scheduler"] + ): _, weight = self.export_submodel(submodel, weights_only=True) weights[submodel] = weight ready, vmfbs, weights = self.is_prepared(vmfbs, weights) @@ -157,7 +159,7 @@ def is_prepared(self, vmfbs, weights): default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") elif key == "scheduler": val = None - default_filepath=None + default_filepath = None continue else: val = vmfbs[key] @@ -494,7 +496,9 @@ def load_pipeline( ) else: print("\n[LOG] Running scheduler on CPU. This will affect performance.") - scheduler = schedulers.get_scheduler(args.hf_model_name, args.scheduler_id) + scheduler = schedulers.get_scheduler( + args.hf_model_name, args.scheduler_id + ) runners["scheduler"] = schedulers.SharkSchedulerCPUWrapper( scheduler, args.batch_size, @@ -535,7 +539,9 @@ def load_pipeline( ], ) pipe_loaded = time.time() - print("\n[LOG] Compiled Pipeline loaded in ", pipe_loaded - load_start, "sec") + print( + "\n[LOG] Compiled Pipeline loaded in ", pipe_loaded - load_start, "sec" + ) else: runners["pipe"] = vmfbRunner( @@ -556,7 +562,9 @@ def load_pipeline( runners["vae_decode"] = runners["pipe"] runners["prompt_encoder"] = runners["pipe"] pipe_loaded = time.time() - print("\n[LOG] Compiled Pipeline loaded in ", pipe_loaded - load_start, "sec") + print( + "\n[LOG] Compiled Pipeline loaded in ", pipe_loaded - load_start, "sec" + ) tok_start = time.time() runners["tokenizer_1"] = CLIPTokenizer.from_pretrained( self.hf_model_name, @@ -704,11 +712,17 @@ def generate_images( for i in range(batch_count): unet_start = time.time() if self.runners["scheduler"]: - sample, time_ids, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) + sample, time_ids, steps, timesteps = self.runners[ + "scheduler" + ].initialize(samples[i]) iree_inputs = [ sample, - ireert.asdevicearray(self.runners["pipe"].config.device, prompt_embeds), - ireert.asdevicearray(self.runners["pipe"].config.device, add_text_embeds), + ireert.asdevicearray( + self.runners["pipe"].config.device, prompt_embeds + ), + ireert.asdevicearray( + self.runners["pipe"].config.device, add_text_embeds + ), time_ids, None, ] @@ -717,13 +731,19 @@ def generate_images( if self.cpu_scheduling: step_index = s else: - step_index = ireert.asdevicearray(self.runners["scheduler"].runner.config.device, torch.tensor([s]), "int64") + step_index = ireert.asdevicearray( + self.runners["scheduler"].runner.config.device, + torch.tensor([s]), + "int64", + ) latents, t = self.runners["scheduler"].scale_model_input( sample, step_index, timesteps, ) - noise_pred = self.runners["pipe"].ctx.modules.compiled_unet["run_forward"]( + noise_pred = self.runners["pipe"].ctx.modules.compiled_unet[ + "run_forward" + ]( latents, t, iree_inputs[1], @@ -738,9 +758,13 @@ def generate_images( step_index, ) if 
isinstance(sample, torch.Tensor): - #TODO: pipe an option for vae_dtype + # TODO: pipe an option for vae_dtype vae_dtype = "float32" if self.precision == "fp32" else "float16" - latents = ireert.asdevicearray(self.runners["vae_decode"].config.device, sample, dtype=vae_dtype) + latents = ireert.asdevicearray( + self.runners["vae_decode"].config.device, + sample, + dtype=vae_dtype, + ) else: latents = sample else: @@ -833,6 +857,7 @@ def numpy_to_pil_image(images): if __name__ == "__main__": from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args + map = empty_pipe_dict if args.split_scheduler: map["scheduler"] = None @@ -894,13 +919,15 @@ def numpy_to_pil_image(images): args.external_weights_dir, args.external_weights, args.vae_decomp_attn, - custom_vae = None, - cpu_scheduling = args.cpu_scheduling, + custom_vae=None, + cpu_scheduling=args.cpu_scheduling, ) vmfbs, weights = sdxl_pipe.check_prepared(mlirs, vmfbs, weights) if args.cpu_scheduling: vmfbs["scheduler"] = None - sdxl_pipe.load_pipeline(vmfbs, weights, args.rt_device, args.compiled_pipeline, args.split_scheduler) + sdxl_pipe.load_pipeline( + vmfbs, weights, args.rt_device, args.compiled_pipeline, args.split_scheduler + ) sdxl_pipe.generate_images( args.prompt, args.negative_prompt, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder_runner.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder_runner.py index 50c01e964..8f633a6f8 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder_runner.py @@ -5,58 +5,18 @@ import numpy as np -def run_torch_clip(hf_model_name, hf_auth_token, prompt, max_length=64): - # TODO: Integrate with HFTransformerBuilder - from turbine_models.custom_models.sdxl_inference.clip import ClipModel - - model_1 = ClipModel(hf_model_name, hf_auth_token, index=1) - model_2 = ClipModel(hf_model_name, hf_auth_token, index=2) - tokenizer_1 = CLIPTokenizer.from_pretrained( - hf_model_name, - subfolder="tokenizer", - token=hf_auth_token, - ) - tokenizer_2 = CLIPTokenizer.from_pretrained( - hf_model_name, - subfolder="tokenizer_2", - token=hf_auth_token, - ) - text_input_1 = tokenizer_1( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - text_input_2 = tokenizer_2( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - example_input_1 = text_input_1.input_ids - example_input_2 = text_input_2.input_ids - - results_1 = model_1.forward(example_input_1) - results_2 = model_2.forward(example_input_2) - np_torch_output_1 = results_1[0].detach().cpu().numpy().astype(np.float16) - np_torch_output_2 = results_2[0].detach().cpu().numpy().astype(np.float16) - return np_torch_output_1, np_torch_output_2 - - def run_prompt_encoder( - args, + vmfb_path, + device, + external_weight_path, input_ids, uncond_input_ids, ): - prompt_encoder_runner = vmfbRunner( - args.device, args.vmfb_path, args.external_weight_path - ) - np.save("input0.npy", input_ids[0].numpy()) - np.save("input1.npy", input_ids[1].numpy()) - np.save("input2.npy", uncond_input_ids[0].numpy()) - np.save("input3.npy", uncond_input_ids[1].numpy()) + prompt_encoder_runner = vmfbRunner(device, vmfb_path, external_weight_path) + # np.save("input0.npy", input_ids[0].numpy()) + # np.save("input1.npy", input_ids[1].numpy()) + # np.save("input2.npy", uncond_input_ids[0].numpy()) + # 
np.save("input3.npy", uncond_input_ids[1].numpy()) prompt_encoder_inputs = [ ireert.asdevicearray(prompt_encoder_runner.config.device, input_ids[0]), ireert.asdevicearray(prompt_encoder_runner.config.device, input_ids[1]), @@ -66,23 +26,19 @@ def run_prompt_encoder( encoded_outputs = prompt_encoder_runner.ctx.modules.compiled_clip["encode_prompts"]( *prompt_encoder_inputs ) + for i in encoded_outputs: + i = i.to_host() del prompt_encoder_inputs return encoded_outputs -if __name__ == "__main__": - from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args - - tokenizer_1 = CLIPTokenizer.from_pretrained( - args.hf_model_name, - subfolder="tokenizer", - token=args.hf_auth_token, - ) - tokenizer_2 = CLIPTokenizer.from_pretrained( - args.hf_model_name, - subfolder="tokenizer_2", - token=args.hf_auth_token, - ) +def run_tokenize( + tokenizer_1, + tokenizer_2, + prompt, + negative_prompt, + max_length=64, +): text_input_ids_list = [] uncond_input_ids_list = [] @@ -90,16 +46,16 @@ def run_prompt_encoder( tokenizers = [tokenizer_1, tokenizer_2] for tokenizer in tokenizers: text_inputs = tokenizer( - args.prompt, + prompt, padding="max_length", - max_length=args.max_length, + max_length=max_length, truncation=True, return_tensors="pt", ) uncond_input = tokenizer( - args.negative_prompt, + negative_prompt, padding="max_length", - max_length=args.max_length, + max_length=max_length, truncation=True, return_tensors="pt", ) @@ -108,9 +64,34 @@ def run_prompt_encoder( text_input_ids_list.extend([text_input_ids]) uncond_input_ids_list.extend([uncond_input_ids]) + return text_input_ids_list, uncond_input_ids_list + +if __name__ == "__main__": + from turbine_models.custom_models.sdxl_inference.sdxl_cmd_opts import args + + tokenizer_1 = CLIPTokenizer.from_pretrained( + args.hf_model_name, + subfolder="tokenizer", + token=args.hf_auth_token, + ) + tokenizer_2 = CLIPTokenizer.from_pretrained( + args.hf_model_name, + subfolder="tokenizer_2", + token=args.hf_auth_token, + ) + + text_input_ids_list, uncond_input_ids_list = run_tokenize( + tokenizer_1, + tokenizer_2, + args.prompt, + args.negative_prompt, + args.max_length, + ) turbine_output1, turbine_output2 = run_prompt_encoder( - args, + args.vmfb_path, + args.rt_device, + args.external_weight_path, text_input_ids_list, uncond_input_ids_list, ) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index dd42ce457..701909ae5 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -94,9 +94,7 @@ def export_unet_model( weights_only=False, ): if pipeline_dir: - safe_name = os.path.join( - pipeline_dir, f"unet" - ) + safe_name = os.path.join(pipeline_dir, f"unet") else: safe_name = utils.create_safe_name( hf_model_name, diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index aab83657c..fa44673ac 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -7,12 +7,16 @@ import logging import pytest import torch +from transformers import CLIPTokenizer from turbine_models.custom_models.sd_inference.utils import create_safe_name +from turbine_models.custom_models.sd_inference import schedulers from turbine_models.custom_models.sdxl_inference import ( - clip, - clip_runner, + sdxl_prompt_encoder, + sdxl_prompt_encoder_runner, unet, unet_runner, + sdxl_scheduled_unet, + sdxl_scheduled_unet_runner, vae, 
vae_runner, sdxl_compiled_pipeline, @@ -92,127 +96,83 @@ def setUp(self): ), ) - def test01_ExportClipModels(self): + def test01_ExportPromptEncoder(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest( "Compilation error on vulkan; Runtime error on rocm; To be tested on cuda." ) - clip.export_clip_model( - # This is a public model, so no auth required - hf_model_name=arguments["hf_model_name"], - hf_auth_token=None, - max_length=arguments["max_length"], - precision=arguments["precision"], - compile_to="vmfb", - external_weights=arguments["external_weights"], - external_weight_path=self.safe_model_name - + "_" - + arguments["precision"] - + "_clip", - device=arguments["device"], - target_triple=arguments["iree_target_triple"], - ireec_flags=arguments["ireec_flags"], - index=1, - exit_on_vmfb=True, - ) - clip.export_clip_model( - hf_model_name=arguments["hf_model_name"], - hf_auth_token=None, # This is a public model, so no auth required - max_length=arguments["max_length"], - precision=arguments["precision"], - compile_to="vmfb", - external_weights=arguments["external_weights"], - external_weight_path=self.safe_model_name - + "_" - + arguments["precision"] - + "_clip", - device=arguments["device"], - target_triple=arguments["iree_target_triple"], - ireec_flags=arguments["ireec_flags"], - index=2, - exit_on_vmfb=True, - ) - arguments["external_weight_path_1"] = ( - self.safe_model_name - + "_" - + arguments["precision"] - + "_clip_1." - + arguments["external_weights"] - ) - arguments["external_weight_path_2"] = ( - self.safe_model_name - + "_" - + arguments["precision"] - + "_clip_2." - + arguments["external_weights"] + arguments["external_weight_path"] = ( + "prompt_encoder." + arguments["external_weights"] ) - arguments["vmfb_path_1"] = ( - self.safe_model_name - + "_" - + str(arguments["max_length"]) - + "_" - + arguments["precision"] - + "_clip_1_" - + arguments["device"] - + ".vmfb" + _, prompt_encoder_vmfb = sdxl_prompt_encoder.export_prompt_encoder( + arguments["hf_model_name"], + None, + arguments["max_length"], + arguments["precision"], + "vmfb", + "safetensors", + arguments["external_weight_path"], + arguments["device"], + arguments["iree_target_triple"], + arguments["ireec_flags"], + False, + None, + None, + arguments["attn_spec"], + False, + arguments["batch_size"], ) - arguments["vmfb_path_2"] = ( - self.safe_model_name - + "_" - + str(arguments["max_length"]) - + "_" - + arguments["precision"] - + "_clip_2_" - + arguments["device"] - + ".vmfb" + tokenizer_1 = CLIPTokenizer.from_pretrained( + arguments["hf_model_name"], + subfolder="tokenizer", + token=arguments["hf_auth_token"], ) - turbine_1 = clip_runner.run_clip( - arguments["rt_device"], - arguments["prompt"], - arguments["vmfb_path_1"], + tokenizer_2 = CLIPTokenizer.from_pretrained( arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path_1"], + subfolder="tokenizer_2", + token=arguments["hf_auth_token"], + ) + ( + text_input_ids_list, + uncond_input_ids_list, + ) = sdxl_prompt_encoder_runner.run_tokenize( + tokenizer_1, + tokenizer_2, + arguments["prompt"], + arguments["negative_prompt"], arguments["max_length"], - index=1, ) - turbine_2 = clip_runner.run_clip( + ( + turbine_output1, + turbine_output2, + ) = sdxl_prompt_encoder_runner.run_prompt_encoder( + prompt_encoder_vmfb, arguments["rt_device"], - arguments["prompt"], - arguments["vmfb_path_2"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path_2"], - 
arguments["max_length"], - index=2, + arguments["external_weight_path"], + text_input_ids_list, + uncond_input_ids_list, ) - torch_output_1, torch_output_2 = clip_runner.run_torch_clip( + torch_model = sdxl_prompt_encoder.PromptEncoderModule( arguments["hf_model_name"], + arguments["precision"], arguments["hf_auth_token"], - arguments["prompt"], - arguments["max_length"], + ) + torch_output1, torch_output2 = torch_model.forward( + *text_input_ids_list, *uncond_input_ids_list ) if arguments["benchmark"] or arguments["tracy_profile"]: run_benchmark( - "clip_1", - arguments["vmfb_path_1"], - arguments["external_weight_path_1"], - arguments["rt_device"], - max_length=arguments["max_length"], - tracy_profile=arguments["tracy_profile"], - ) - run_benchmark( - "clip_2", - arguments["vmfb_path_2"], - arguments["external_weight_path_2"], + "prompt_encoder", + prompt_encoder_vmfb, + arguments["external_weight_path"], arguments["rt_device"], max_length=arguments["max_length"], tracy_profile=arguments["tracy_profile"], ) rtol = 4e-1 atol = 4e-1 - np.testing.assert_allclose(torch_output_1, turbine_1[0], rtol, atol) - np.testing.assert_allclose(torch_output_2, turbine_2[0], rtol, atol) + np.testing.assert_allclose(torch_output1, turbine_output1, rtol, atol) + np.testing.assert_allclose(torch_output2, turbine_output2, rtol, atol) def test02_ExportUnetModel(self): if arguments["device"] in ["vulkan", "cuda"]: diff --git a/models/turbine_models/utils/sdxl_benchmark.py b/models/turbine_models/utils/sdxl_benchmark.py index 1c37f93a1..decc2d940 100644 --- a/models/turbine_models/utils/sdxl_benchmark.py +++ b/models/turbine_models/utils/sdxl_benchmark.py @@ -41,6 +41,8 @@ def run_benchmark( inputs.append(f"1x{max_length}xi64") case "clip_2": inputs.append(f"1x{max_length}xi64") + case "prompt_encoder": + inputs.extend([f"1x{max_length}xi64"] * 4) case "unet": inputs.append( f"{batch_size}x{in_channels}x{height//8}x{width//8}x{DTYPE_MAP[precision]}" From 9798fd7f03691ce9d1f4e06ad7305e72b2feaca6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 13 Jun 2024 02:29:37 -0500 Subject: [PATCH 118/174] SD3 text encoding and diffusion modeling. 
--- .gitignore | 5 +- .../sd3_inference/sd3_cmd_opts.py | 345 +++++++++++ .../sd3_inference/sd3_mmdit_runner.py | 115 ++++ .../sd3_inference/sd3_schedulers.py | 322 +++++++++++ .../sd3_inference/sd3_text_encoders.py | 227 ++++++++ .../sd3_inference/sd3_text_encoders_runner.py | 116 ++++ .../sd3_inference/text_encoder_impls.py | 537 ++++++++++++++++++ .../sd3_inference/turbine_mmdit.py | 217 +++++++ .../custom_models/sd_inference/utils.py | 11 +- 9 files changed, 1893 insertions(+), 2 deletions(-) create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py create mode 100644 models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py create mode 100644 models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py diff --git a/.gitignore b/.gitignore index f5fe49941..54f4c40cc 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,7 @@ wheelhouse *.safetensors *.gguf *.vmfb -*.mlir \ No newline at end of file +*.mlir +*.npy +*.png +*tmp* diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py new file mode 100644 index 000000000..535135daa --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -0,0 +1,345 @@ +import argparse +import os +from pathlib import Path + + +def path_expand(s): + return Path(s).expanduser().resolve() + + +def is_valid_file(arg): + if not os.path.exists(arg): + return None + else: + return arg + + +# Note: this is where command-line options for the scripts in this directory +# are defined along with their defaults. Thus, they should not be referenced +# within modelling or inference code, only at the entry point to the script. + +# We should consider separating out the options that are "model configs" from +# the options that control the compiler, runtime, and script behavior, +# when applicable, as the formermost would best be kept in a separate +# config or imported from huggingface. + +p = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter +) + +############################################################################## +# SD3 Source Options +############################################################################## + +p.add_argument( + "--hf_auth_token", + type=str, + help="The Hugging Face auth token, if required", + default=None, +) +p.add_argument( + "--hf_model_name", + type=str, + help="HF model name", + default="stabilityai/stable-diffusion-3-medium-diffusers", +) +p.add_argument( + "--scheduler_id", + type=str, + help="Scheduler ID", + default="EulerDiscrete", +) +p.add_argument( + "--model_path", + type=str, + help="Path to model .safetensors from which the model is defined.", + default=None, +) +p.add_argument( + "--vae_model_path", + type=str, + help="Path to vae model .safetensors from which the model is defined.", + default=None, +) + +############################################################################## +# SD3 Inference Options +# These options are used to control runtime parameters for SD3 inference. 
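+# They can be changed between runs without re-importing or recompiling the model,
+# unlike the "SD3 Modelling Options" further below.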
+############################################################################## + +p.add_argument( + "--prompt", + type=str, + default=" a cat under the snow with blue eyes, covered by snow, cinematic style, medium shot, professional photo, animal", + help="Prompt input to stable diffusion.", +) + +p.add_argument( + "--negative_prompt", + type=str, + default="Watermark, blurry, oversaturated, low resolution, pollution", + help="Negative prompt input to stable diffusion.", +) + +p.add_argument( + "--num_inference_steps", type=int, default=30, help="Number of UNet inference steps" +) + +p.add_argument( + "--batch_count", + type=int, + default=1, + help="Number of batches to run for a single prompt", +) + +p.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help="Scale by which to adjust prompt guidance to the unconditional noise prediction output of UNet after each iteration.", +) + +p.add_argument( + "--seed", type=float, default=0, help="Seed for random number/latents generation." +) + +p.add_argument( + "--denoise", + type=float, + default=1.0, + help="Denoising factor for image to image", +) + +p.add_argument( + "--external_weight_path", + type=str, + default="", + help="Path to external weights file, for jobs with one weights filepath. When importing, this is used to specify where to save the model weights, and at runtime, this is used to specify where to load the model weights from.", +) + +p.add_argument( + "--external_weights_dir", + type=str, + default="", + help="Directory containing external weights for a job that requires more than one weights file. When importing, this is used to specify where to save the model weights, and at runtime, this is used to specify where to load the model weights from. Files will then be saved according to the parameters that make them unique, i.e. ___.", +) + +p.add_argument( + "--vmfb_path", type=str, default="", help="path to vmfb containing compiled module" +) + +p.add_argument( + "--pipeline_vmfb_path", + type=str, + default="", + help="path to vmfb containing compiled meta-module", +) + +p.add_argument( + "--scheduler_vmfb_path", + type=str, + default="", + help="path to vmfb containing compiled scheduler", +) + +p.add_argument( + "--split_scheduler", + default=False, + action="store_true", + help="Use a decoupled unet and scheduler for better QOL.", +) + +p.add_argument( + "--cpu_scheduling", + default=False, + action="store_true", + help="Run scheduling on torch cpu (will be slower due to data movement costs).", +) + +p.add_argument( + "--external_weight_file", + type=str, + default=None, + help="Path to external weights, used in benchmark scripts.", +) + +p.add_argument( + "--pipeline_dir", + type=str, + default=None, + help="Directory to save pipeline artifacts", +) + +p.add_argument( + "--compiled_pipeline", + default=False, + action="store_true", + help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.", +) + +############################################################################## +# SD3 Modelling Options +# These options are used to control model defining parameters for SD3. +# These are MLIR - changing variables! If you change them, you will need +# to import/download and recompile the model. +############################################################################## + +p.add_argument("--batch_size", type=int, default=1, help="Batch size for inference") +p.add_argument( + "--height", type=int, default=1024, help="Height of Stable Diffusion output image." 
+) +p.add_argument( + "--width", type=int, default=1024, help="Width of Stable Diffusion output image" +) +p.add_argument( + "--precision", + type=str, + default="fp16", + help="Precision of Stable Diffusion weights and graph.", +) +p.add_argument( + "--max_length", type=int, default=77, help="Sequence Length of Stable Diffusion" +) +p.add_argument("--vae_variant", type=str, default="decode", help="encode, decode") +p.add_argument( + "--shift", type=float, default=3, help="Sampling shift value for sd3 scheduling" +) +p.add_argument( + "--vae_decomp_attn", + type=bool, + default=False, + help="Decompose attention for VAE decode only at fx graph level", +) + +############################################################################## +# SD3 script general options. +############################################################################## + +p.add_argument("--compile_to", type=str, default="mlir", help="torch, linalg, vmfb") +p.add_argument( + "--init_image", + type=str, + default=None, + help="Path to initial image for inference", +) +p.add_argument( + "--external_weights", + type=str, + default=None, + choices=["safetensors", "irpa", "gguf", None], + help="Externalizes model weights from the torch dialect IR and its successors", +) + +# See --external_weight_path and --external_weight_dir to specify where to save the model weights. +p.add_argument( + "--weights_only", + action="store_true", + help="Just grab the weights for your model and exit instead of exporting any IR.", +) +p.add_argument( + "--compare_vs_torch", + action="store_true", + help="Runs both turbine vmfb and a torch model to compare results", +) +p.add_argument( + "--decomp_attn", + default=False, + action="store_true", + help="Decompose attention at fx graph level", +) +p.add_argument( + "--exit_on_vmfb", + default=True, + action="store_false", + help="Exit program on vmfb compilation completion. Most scripts will also save .mlir if this is disabled.", +) +p.add_argument( + "--input_mlir", + type=str, + default=None, + help="Path to input mlir file to compile. 
Comma-separate paths to provide more than one input to pipelines.", +) +p.add_argument( + "--download_mlir", + default=False, + action="store_true", + help="Download missing mlir files from Azure storage.", +) +p.add_argument( + "--container_name", + type=str, + default=None, + help="Azure storage container name to download mlir files from.", +) +p.add_argument( + "--export", + type=str, + default="all", + help="clip, mmdit, vae, all") +p.add_argument( + "--output", + type=str, + default="SD3_output.png", + help="Path to output file for generated images.", +) + +############################################################################## +# IREE Compiler Options +############################################################################## + +p.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm") + +p.add_argument( + "--rt_device", + type=str, + default="local-task", + help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.", +) + +# TODO: Bring in detection for target triple +p.add_argument( + "--iree_target_triple", + type=str, + default="", + help="Specify vulkan target triple or rocm/cuda target device.", +) + +p.add_argument("--ireec_flags", type=str, default="", help="extra iree-compile options") + +p.add_argument( + "--attn_flags", + type=str, + default="", + help="extra iree-compile options for models with iree_linalg_ext.attention ops.", +) + +p.add_argument( + "--attn_spec", + type=str, + default=None, + help="extra iree-compile options for models with iree_linalg_ext.attention ops. Set this to 'default' if you are using mfma-capable hardware with ROCM.", +) + +p.add_argument( + "--clip_flags", + type=str, + default="", + help="extra iree-compile options to send for compiling CLIP/prompt_encoder. Only use this for testing bleeding edge flags! Any default options should be added to sd_inference/utils.py", +) + +p.add_argument( + "--vae_flags", + type=str, + default="", + help="extra iree-compile options to send for compiling VAE. Only use this for testing bleeding edge flags! Any default options should be added to sd_inference/utils.py", +) + +p.add_argument( + "--unet_flags", + type=str, + default="", + help="extra iree-compile options to send for compiling unet. Only use this for testing bleeding edge flags! 
Any default options should be added to sd_inference/utils.py", +) + + +args, unknown = p.parse_known_args() diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py new file mode 100644 index 000000000..fe3ae2b4e --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -0,0 +1,115 @@ +import argparse +from turbine_models.model_runner import vmfbRunner +from turbine_models.custom_models.sd_inference import utils, schedulers +from iree import runtime as ireert +import torch +import numpy as np +from tqdm.auto import tqdm +from shark_turbine.ops.iree import trace_tensor + +torch.random.manual_seed(0) + + +def run_mmdit_turbine( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + lora_scale, + args, +): + torch_dtype = torch.float16 if args.precision == "fp16" else torch.float32 + mmdit_runner = vmfbRunner( + args.device, + args.vmfb_path, + args.external_weight_path, + ) + iree_inputs = [ + ireert.asdevicearray(mmdit_runner.config.device, hidden_states), + ireert.asdevicearray(mmdit_runner.config.device, encoder_hidden_states), + ireert.asdevicearray(mmdit_runner.config.device, pooled_projections), + ireert.asdevicearray(mmdit_runner.config.device, timestep), + ireert.asdevicearray(mmdit_runner.config.device, lora_scale), + ] + noise_pred = mmdit_runner.ctx.modules.compiled_mmdit["run_forward"](*iree_inputs).to_host() + return noise_pred + + +@torch.no_grad() +def run_diffusers_mmdit( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + lora_scale, + args, +): + from turbine_models.custom_models.sd3_inference.turbine_mmdit import MMDiTModel + mmdit_model = MMDiTModel( + args.hf_model_name, + dtype=torch.float32, + ) + noise_pred = mmdit_model.forward( + hidden_states.float(), encoder_hidden_states.float(), pooled_projections.float(), timestep.float(), lora_scale.float() + ) + + return noise_pred.numpy() + + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + import numpy as np + import os + + torch.random.manual_seed(0) + + if args.precision == "fp16": + dtype = torch.float16 + else: + dtype = torch.float32 + + hidden_states = torch.randn( + (args.batch_size, 16, args.height // 8, args.width // 8), dtype=dtype + ) + encoder_hidden_states = torch.randn( + (args.batch_size, args.max_length, 4096), dtype=dtype + ) + pooled_projections = torch.randn((args.batch_size, 2048), dtype=dtype) + timestep = torch.tensor([0], dtype=dtype) + lora_scale = torch.tensor([1.0], dtype=dtype) + + turbine_output = run_mmdit_turbine( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + lora_scale, + args, + ) + print( + "TURBINE SPLIT OUTPUT:", + turbine_output, + turbine_output.shape, + turbine_output.dtype, + ) + turbine_output = turbine_output + + if args.compare_vs_torch: + print("generating torch output: ") + torch_output = run_diffusers_mmdit( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + lora_scale, + args, + ) + print("torch OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) + + print("\n(torch (comfy) image latents to iree image latents): ") + + np.testing.assert_allclose( + turbine_output, torch_output, rtol=4e-2, atol=4e-2 + ) + print("passed!") + diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py new 
file mode 100644 index 000000000..87492a701 --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -0,0 +1,322 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +from typing import List + +import torch +from shark_turbine.aot import * +import shark_turbine.ops.iree as ops +from iree.compiler.ir import Context +import iree.runtime as ireert +import numpy as np + +from diffusers import ( + FlowMatchEulerDiscreteScheduler, +) + +from turbine_models.turbine_tank import turbine_tank +from turbine_models.custom_models.sd_inference import utils +from turbine_models.model_runner import vmfbRunner + + +class SharkSchedulerWrapper: + def __init__(self, rt_device, vmfb): + self.runner = vmfbRunner(rt_device, vmfb, None) + + def initialize(self, sample): + sample, time_ids, steps, timesteps = self.runner.ctx.modules.compiled_scheduler[ + "run_init" + ](sample) + return sample, steps.to_host(), timesteps + + def prepare_model_input(self, sample, t, timesteps): + return self.runner.ctx.modules.compiled_scheduler["run_prep"]( + sample, t, timesteps + ) + + def step(self, noise_pred, t, sample, guidance_scale, step_index): + return self.runner.ctx.modules.compiled_scheduler["run_step"]( + noise_pred, t, sample, guidance_scale, step_index + ) + + +class FlowSchedulingModel(torch.nn.Module): + def __init__( + self, + hf_model_name, + num_inference_steps, + dtype, + ): + super().__init__() + # For now, assumes SDXL implementation. May not need parametrization for other models, + # but keeping hf_model_name in case. + self.model = FlowMatchEulerDiscreteScheduler.from_pretrained(hf_model_name, subfolder="scheduler") + self.do_classifier_free_guidance = True + self.model.set_timesteps(num_inference_steps) + self.timesteps = self.model.timesteps + self.dtype = dtype + + # TODO: Make steps dynamic here + def initialize(self, sample): + step_count = torch.tensor(len(self.timesteps)) + timesteps = self.model.timesteps + # ops.trace_tensor("timesteps", self.timesteps) + return ( + sample.type(self.dtype), + step_count, + timesteps.type(torch.float32), + ) + + def prepare_model_input(self, sample, t, timesteps): + t = timesteps[t] + t = t.expand(sample.shape[0]) + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([sample] * 2) + else: + latent_model_input = sample + return latent_model_input.type(self.dtype), t.type(self.dtype) + + def step(self, noise_pred, t, sample, guidance_scale, i): + self.model._step_index = i + + if self.do_classifier_free_guidance: + noise_preds = noise_pred.chunk(2) + noise_pred = noise_preds[0] + guidance_scale * ( + noise_preds[1] - noise_preds[0] + ) + sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] + return sample.type(self.dtype) + + +class SharkSchedulerCPUWrapper: + @torch.no_grad() + def __init__( + self, scheduler, batch_size, num_inference_steps, dest_device, latents_dtype + ): + self.do_classifier_free_guidance = True + self.module = scheduler + self.dest = dest_device + self.dtype = latents_dtype + self.batch_size = batch_size + self.module.set_timesteps(num_inference_steps) + self.timesteps = self.module.timesteps + self.torch_dtype = ( + torch.float32 if latents_dtype == "float32" else torch.float16 + ) + + def initialize(self, sample): + if isinstance(sample, ireert.DeviceArray): + sample = 
torch.tensor(sample.to_host(), dtype=torch.float32) + step_indexes = torch.tensor(len(self.module.timesteps)) + timesteps = self.timesteps + return sample, step_indexes, timesteps + + def scale_model_input(self, sample, t, timesteps): + if self.do_classifier_free_guidance: + sample = torch.cat([sample] * 2) + t = timesteps[t] + t = t.expand(sample.shape[0]) + t = ireert.asdevicearray(self.dest, [t], self.dtype) + sample = ireert.asdevicearray(self.dest, sample, self.dtype) + return sample, t + + def step(self, noise_pred, t, latents, guidance_scale, i): + if isinstance(t, ireert.DeviceArray): + t = torch.tensor(t.to_host()) + if isinstance(guidance_scale, ireert.DeviceArray): + guidance_scale = torch.tensor(guidance_scale.to_host()) + noise_pred = torch.tensor(noise_pred.to_host()) + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + return self.module.step( + noise_pred, + t, + latents, + return_dict=False, + )[0] + + +@torch.no_grad() +def export_scheduler_model( + hf_model_name: str, + batch_size: int = 1, + height: int = 512, + width: int = 512, + num_inference_steps: int = 30, + precision: str = "fp16", + compile_to: str = "torch", + device: str = None, + target_triple: str = None, + ireec_flags: str = None, + exit_on_vmfb: bool = False, + pipeline_dir: str = None, + input_mlir: str = None, + upload_ir=False, +): + dtype = torch.float16 if precision == "fp16" else torch.float32 + scheduler_module = FlowSchedulingModel( + hf_model_name, num_inference_steps, dtype + ) + if pipeline_dir: + vmfb_names = [ + "EulerFlowScheduler", + str(num_inference_steps), + ] + vmfb_name = "_".join(vmfb_names) + safe_name = os.path.join(pipeline_dir, vmfb_name) + else: + vmfb_names = [ + "EulerFlowScheduler", + f"bs{args.batch_size}_{args.height}x{args.width}", + precision, + str(num_inference_steps), + target_triple, + ] + vmfb_name = "_".join(vmfb_names) + safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) + + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + ) + return vmfb_path + + do_classifier_free_guidance = True + if do_classifier_free_guidance: + init_batch_dim = 2 + else: + init_batch_dim = 1 + + sample = ( + batch_size, + 16, + height // 8, + width // 8, + ) + noise_pred_shape = ( + batch_size * init_batch_dim, + 16, + height // 8, + width // 8, + ) + example_init_args = [torch.empty(sample, dtype=dtype)] + example_prep_args = ( + torch.empty(sample, dtype=dtype), + torch.empty(1, dtype=torch.int64), + torch.empty([19], dtype=torch.float32), + ) + timesteps = torch.export.Dim("timesteps") + prep_dynamic_args = { + "sample": {}, + "t": {}, + "timesteps": {0: timesteps}, + } + example_step_args = [ + torch.empty(noise_pred_shape, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(sample, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(1, dtype=torch.int64), + ] + + fxb = FxProgramsBuilder(scheduler_module) + + @fxb.export_program( + args=(example_init_args,), + ) + def _initialize(module, sample): + return module.initialize(*sample) + + @fxb.export_program( + args=example_prep_args, + dynamic_shapes=prep_dynamic_args, + ) + def _prep(module, sample, t, timesteps): + return module.prepare_model_input(sample, t, timesteps) + + @fxb.export_program( + args=(example_step_args,), + ) + def 
_step(module, inputs): + return module.step(*inputs) + + decomp_list = [] + # if decomp_attn == True: + # decomp_list.extend( + # [ + # torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + # torch.ops.aten._scaled_dot_product_flash_attention.default, + # ] + # ) + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + + class CompiledScheduler(CompiledModule): + run_init = _initialize + run_prep = _prep + run_step = _step + + import_to = "INPUT" if compile_to == "linalg" else "IMPORT" + inst = CompiledScheduler(context=Context(), import_to=import_to) + + module_str = str(CompiledModule.get_mlir_module(inst)) + + if compile_to != "vmfb": + return module_str + elif compile_to == "vmfb": + vmfb = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=True, + ) + if exit_on_vmfb: + exit() + return vmfb + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + mod_str = export_scheduler_model( + args.hf_model_name, + args.batch_size, + args.height, + args.width, + args.num_inference_steps, + args.precision, + args.compile_to, + args.device, + args.iree_target_triple, + args.ireec_flags, + exit_on_vmfb=False, + input_mlir=args.input_mlir, + ) + vmfb_names = [ + "EulerFlowScheduler", + f"bs{args.batch_size}_{args.height}x{args.width}", + args.precision, + str(args.num_inference_steps), + args.iree_target_triple, + ] + safe_name = "_".join(vmfb_names) + if args.compile_to != "vmfb": + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py new file mode 100644 index 000000000..895f27bf7 --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -0,0 +1,227 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
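+#
+# Builds the SD3 text-encoder stack (CLIP-L, CLIP-G and T5-XXL, implemented in
+# text_encoder_impls.py), loads its weights from the
+# stabilityai/stable-diffusion-3-medium safetensors, and exports a single
+# `encode_tokens` entry point returning prompt and pooled embeddings for the
+# conditioned and unconditioned prompts.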
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import sys + +import safetensors +from iree import runtime as ireert +import iree.compiler as ireec +from iree.compiler.ir import Context +import numpy as np +from shark_turbine.aot import * +from turbine_models.custom_models.sd_inference import utils +import torch +from turbine_models.custom_models.sd3_inference.text_encoder_impls import SDClipModel, SDXLClipG, T5XXLModel, load_into +from huggingface_hub import hf_hub_download +from safetensors import safe_open + +CLIPG_CONFIG = { + "hidden_act": "gelu", + "hidden_size": 1280, + "intermediate_size": 5120, + "num_attention_heads": 20, + "num_hidden_layers": 32 +} + +CLIPL_CONFIG = { + "hidden_act": "quick_gelu", + "hidden_size": 768, + "intermediate_size": 3072, + "num_attention_heads": 12, + "num_hidden_layers": 12 +} + +T5_CONFIG = { + "d_ff": 10240, + "d_model": 4096, + "num_heads": 64, + "num_layers": 24, + "vocab_size": 32128 +} + +class TextEncoderModule(torch.nn.Module): + @torch.no_grad() + def __init__( + self, + batch_size=1, + ): + super().__init__() + self.dtype = torch.float16 + self.clip_l = SDClipModel( + layer="hidden", + layer_idx=-2, + device="cpu", + dtype=self.dtype, + layer_norm_hidden_state=False, + return_projected_pooled=False, + textmodel_json_config=CLIPL_CONFIG + ).half() + clip_l_weights = hf_hub_download( + repo_id="stabilityai/stable-diffusion-3-medium", + filename="text_encoders/clip_l.safetensors" + ) + with safe_open(clip_l_weights, framework="pt", device="cpu") as f: + load_into(f, self.clip_l.transformer, "", "cpu", self.dtype) + self.clip_g = SDXLClipG(CLIPG_CONFIG, device="cpu", dtype=self.dtype).half() + clip_g_weights = hf_hub_download( + repo_id="stabilityai/stable-diffusion-3-medium", + filename="text_encoders/clip_g.safetensors" + ) + with safe_open(clip_g_weights, framework="pt", device="cpu") as f: + load_into(f, self.clip_g.transformer, "", "cpu", self.dtype) + self.t5xxl = T5XXLModel(T5_CONFIG, device="cpu", dtype=self.dtype).half() + t5_weights = hf_hub_download( + repo_id="stabilityai/stable-diffusion-3-medium", + filename="text_encoders/t5xxl_fp16.safetensors" + ) + with safe_open(t5_weights, framework="pt", device="cpu") as f: + load_into(f, self.t5xxl.transformer, "", "cpu", self.dtype) + + self.do_classifier_free_guidance = True + self.batch_size = batch_size + + def get_cond(self, tokens_l, tokens_g, tokens_t5xxl): + l_out, l_pooled = self.clip_l.forward(tokens_l) + g_out, g_pooled = self.clip_g.forward(tokens_g) + t5_out, _ = self.t5xxl.forward(tokens_t5xxl) + lg_out = torch.cat([l_out, g_out], dim=-1) + lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1])) + return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1) + + def forward(self, tokens_g, tokens_l, tokens_t5xxl, neg_g, neg_l, neg_t5): + conditioning, cond_pool = self.get_cond(tokens_l, tokens_g, tokens_t5xxl) + neg_cond, neg_cond_pool = self.get_cond(neg_l, neg_g, neg_t5) + + prompt_embeds = torch.cat([neg_cond, conditioning], dim=0) + pooled_prompt_embeds = torch.cat([cond_pool, neg_cond_pool], dim=0) + + return prompt_embeds, pooled_prompt_embeds + +@torch.no_grad() +def export_text_encoder( + hf_model_name, + hf_auth_token=None, + max_length=64, + precision="fp16", + compile_to="torch", + external_weights=None, + external_weight_path=None, + device=None, + target_triple=None, + ireec_flags=None, + exit_on_vmfb=True, + pipeline_dir=None, + input_mlir=None, + attn_spec=None, + output_batchsize=1, + 
decomp_attn=True, +): + if pipeline_dir not in [None, ""]: + safe_name = os.path.join(pipeline_dir, "text_encoders") + else: + safe_name = utils.create_safe_name( + hf_model_name, f"_{str(max_length)}_{precision}_text_encoders-{device}" + ) + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + const_expr_hoisting=True, + attn_spec=attn_spec, + ) + return vmfb_path + model = TextEncoderModule( + batch_size=output_batchsize, + ) + mapper = {} + + assert ".safetensors" not in external_weight_path, "Original parameters format incompatible with IREE safetensors parser. Use '.irpa' instead." + + input_args = [torch.empty([1,77,2], dtype=torch.int64) for x in range(6)] + + decomp_list = [] + if decomp_attn == True: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(model) + + @fxb.export_program( + args=(input_args,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) + + class CompiledTextEncoder(CompiledModule): + encode_tokens = _forward + + if external_weights: + externalize_module_parameters(model) + save_module_parameters(external_weight_path, model) + + inst = CompiledTextEncoder(context=Context(), import_to="IMPORT") + + module_str = str(CompiledModule.get_mlir_module(inst)) + + if compile_to != "vmfb": + return module_str + else: + vmfb_path = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=not exit_on_vmfb, + const_expr_hoisting=True, + attn_spec=attn_spec, + ) + return module_str, vmfb_path + + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + mod_str, _ = export_text_encoder( + args.hf_model_name, + args.hf_auth_token, + args.max_length, + args.precision, + args.compile_to, + args.external_weights, + args.external_weight_path, + args.device, + args.iree_target_triple, + args.ireec_flags + args.clip_flags, + exit_on_vmfb=True, + pipeline_dir=args.pipeline_dir, + input_mlir=args.input_mlir, + attn_spec=args.attn_spec, + output_batchsize=args.batch_size, + ) + if args.input_mlir or args.weights_only or args.compile_to=="vmfb": + exit() + safe_name = utils.create_safe_name( + args.hf_model_name, f"_{str(args.max_length)}_{args.precision}_text_encoders" + ) + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py new file mode 100644 index 000000000..1093f4b27 --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py @@ -0,0 +1,116 @@ +from turbine_models.model_runner import vmfbRunner +from text_encoder_impls import SD3Tokenizer, T5XXLTokenizer, SDXLClipGTokenizer +from iree import runtime as ireert +import torch +import numpy as np + + +def run_prompt_encoder( + vmfb_path, + device, + external_weight_path, + input_ids, + uncond_input_ids, +): + prompt_encoder_runner = vmfbRunner(device, vmfb_path, external_weight_path) + # np.save("input0.npy", input_ids[0].numpy()) + # np.save("input1.npy", 
input_ids[1].numpy()) + # np.save("input2.npy", input_ids[2].numpy()) + # np.save("input3.npy", uncond_input_ids[0].numpy()) + # np.save("input4.npy", uncond_input_ids[1].numpy()) + # np.save("input5.npy", uncond_input_ids[2].numpy()) + prompt_encoder_inputs = [ + ireert.asdevicearray(prompt_encoder_runner.config.device, input_ids[0]), + ireert.asdevicearray(prompt_encoder_runner.config.device, input_ids[1]), + ireert.asdevicearray(prompt_encoder_runner.config.device, input_ids[2]), + ireert.asdevicearray(prompt_encoder_runner.config.device, uncond_input_ids[0]), + ireert.asdevicearray(prompt_encoder_runner.config.device, uncond_input_ids[1]), + ireert.asdevicearray(prompt_encoder_runner.config.device, uncond_input_ids[2]), + + ] + encoded_outputs = prompt_encoder_runner.ctx.modules.compiled_text_encoder["encode_tokens"]( + *prompt_encoder_inputs + ) + for i in encoded_outputs: + i = i.to_host() + del prompt_encoder_inputs + return encoded_outputs + + +def run_tokenize( + tokenizer, + prompt, + negative_prompt, +): + + prompt_tokens_dict = tokenizer.tokenize_with_weights(prompt) + neg_prompt_tokens_dict = tokenizer.tokenize_with_weights(negative_prompt) + text_input_ids_list = list(prompt_tokens_dict.values()) + uncond_input_ids_list = list(neg_prompt_tokens_dict.values()) + return text_input_ids_list, uncond_input_ids_list + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + tokenizer = SD3Tokenizer() + + text_input_ids_list, uncond_input_ids_list = run_tokenize( + tokenizer, + args.prompt, + args.negative_prompt, + ) + turbine_output1, turbine_output2 = run_prompt_encoder( + args.vmfb_path, + args.rt_device, + args.external_weight_path, + text_input_ids_list, + uncond_input_ids_list, + ) + print( + "TURBINE OUTPUT 1:", + turbine_output1.to_host(), + turbine_output1.shape, + turbine_output1.dtype, + ) + + print( + "TURBINE OUTPUT 2:", + turbine_output2.to_host(), + turbine_output2.shape, + turbine_output2.dtype, + ) + + if args.compare_vs_torch: + print("generating torch output: ") + from turbine_models.custom_models.sd_inference import utils + from turbine_models.custom_models.sd3_inference.sd3_text_encoders import ( + TextEncoderModule, + ) + + torch_encoder_model = TextEncoderModule( + args.batch_size, + ) + torch_output1, torch_output2 = torch_encoder_model.forward( + *text_input_ids_list, *uncond_input_ids_list + ) + np.save("torch_output1.npy", torch_output1) + np.save("torch_output2.npy", torch_output2) + print( + "TORCH OUTPUT 1:", torch_output1, torch_output1.shape, torch_output1.dtype + ) + + print( + "TORCH OUTPUT 2:", torch_output2, torch_output2.shape, torch_output2.dtype + ) + rtol = 4e-2 + atol = 4e-2 + + np.testing.assert_allclose( + torch_output1, turbine_output1.to_host(), rtol, atol, verbose=True + ) + np.testing.assert_allclose( + torch_output2, turbine_output2.to_host(), rtol, atol, verbose=True + ) + print("Passed!") + # TODO: Figure out why we occasionally segfault without unlinking output variables + turbine_output1, turbine_output2 = (None, None) diff --git a/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py new file mode 100644 index 000000000..09a69fe9c --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py @@ -0,0 +1,537 @@ +### This file contains impls for underlying related models (CLIP, T5, etc) + +import torch, math +from torch import nn +from transformers import 
CLIPTokenizer, T5TokenizerFast +from shark_turbine import ops + +################################################################################################# +### Core/Utility +################################################################################################# + + +def attention(q, k, v, heads, mask=None): + """Convenience wrapper around a basic attention operation""" + b, _, dim_head = q.shape + #ops.iree.trace_tensor("attention_q", q[0,0,:5]) + #ops.iree.trace_tensor("attention_k", k[0,0,:5]) + #ops.iree.trace_tensor("attention_v", v[0,0,:5]) + dim_head //= heads + q, k, v = map(lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2), (q, k, v)) + out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) + #ops.iree.trace_tensor("attention_out", out[0,0,:5]) + return out.transpose(1, 2).reshape(b, -1, heads * dim_head) + + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks""" + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, dtype=None, device=None): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, dtype=dtype, device=device) + self.act = act_layer + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, dtype=dtype, device=device) + + def forward(self, x): + x = self.fc1(x) + #ops.iree.trace_tensor("mlpfx", x[0,0,:5]) + x = self.act(x) + #ops.iree.trace_tensor("mlpact", x[0,0,:5]) + x = self.fc2(x) + #ops.iree.trace_tensor("mlpanotherfc", x[0,0,:5]) + return x + +def load_into(f, model, prefix, device, dtype=None): + """Just a debugging-friendly hack to apply the weights in a safetensors file to the pytorch module.""" + for key in f.keys(): + if key.startswith(prefix) and not key.startswith("loss."): + path = key[len(prefix):].split(".") + obj = model + for p in path: + if obj is list: + obj = obj[int(p)] + else: + obj = getattr(obj, p, None) + if obj is None: + print(f"Skipping key '{key}' in safetensors file as '{p}' does not exist in python model") + break + if obj is None: + continue + try: + tensor = f.get_tensor(key).to(device=device) + if dtype is not None: + tensor = tensor.to(dtype=dtype) + obj.requires_grad_(False) + obj.set_(tensor) + except Exception as e: + print(f"Failed to load key '{key}' in safetensors file: {e}") + raise e + +################################################################################################# +### CLIP +################################################################################################# + + +class CLIPAttention(torch.nn.Module): + def __init__(self, embed_dim, heads, dtype, device): + super().__init__() + self.heads = heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + + def forward(self, x, mask=None): + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + out = attention(q, k, v, self.heads, mask) + return self.out_proj(out) + + +ACTIVATIONS = { + "quick_gelu": lambda a: a * torch.sigmoid(1.702 * a), + "gelu": torch.nn.functional.gelu, +} + +class CLIPLayer(torch.nn.Module): + def __init__(self, embed_dim, 
heads, intermediate_size, intermediate_activation, dtype, device): + super().__init__() + self.layer_norm1 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) + self.self_attn = CLIPAttention(embed_dim, heads, dtype, device) + self.layer_norm2 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) + #self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device) + self.mlp = Mlp(embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation], dtype=dtype, device=device) + + def forward(self, x, mask=None): + x += self.self_attn(self.layer_norm1(x), mask) + x += self.mlp(self.layer_norm2(x)) + return x + + +class CLIPEncoder(torch.nn.Module): + def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): + super().__init__() + self.layers = torch.nn.ModuleList([CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) for i in range(num_layers)]) + + def forward(self, x, mask=None, intermediate_output=None): + if intermediate_output is not None: + if intermediate_output < 0: + intermediate_output = len(self.layers) + intermediate_output + intermediate = None + for i, l in enumerate(self.layers): + x = l(x, mask) + if i == intermediate_output: + intermediate = x.clone() + return x, intermediate + + +class CLIPEmbeddings(torch.nn.Module): + def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None): + super().__init__() + self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device) + self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device) + + def forward(self, input_tokens): + return self.token_embedding(input_tokens) + self.position_embedding.weight + + +class CLIPTextModel_(torch.nn.Module): + def __init__(self, config_dict, dtype, device): + num_layers = config_dict["num_hidden_layers"] + embed_dim = config_dict["hidden_size"] + heads = config_dict["num_attention_heads"] + intermediate_size = config_dict["intermediate_size"] + intermediate_activation = config_dict["hidden_act"] + super().__init__() + self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device) + self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) + self.final_layer_norm = nn.LayerNorm(embed_dim, dtype=dtype, device=device) + + def forward(self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True): + x = self.embeddings(input_tokens) + causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1) + x, i = self.encoder(x, mask=causal_mask, intermediate_output=intermediate_output) + x = self.final_layer_norm(x) + if i is not None and final_layer_norm_intermediate: + i = self.final_layer_norm(i) + pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),] + return x, i, pooled_output + + +class CLIPTextModel(torch.nn.Module): + def __init__(self, config_dict, dtype, device): + super().__init__() + self.num_layers = config_dict["num_hidden_layers"] + self.text_model = CLIPTextModel_(config_dict, dtype, device) + embed_dim = config_dict["hidden_size"] + self.text_projection = nn.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) + self.text_projection.weight.copy_(torch.eye(embed_dim)) + self.dtype = dtype + + def get_input_embeddings(self): + return 
self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, embeddings): + self.text_model.embeddings.token_embedding = embeddings + + def forward(self, *args, **kwargs): + x = self.text_model(*args, **kwargs) + out = self.text_projection(x[2]) + return (x[0], x[1], out, x[2]) + +class SDTokenizer: + def __init__(self, max_length=77, pad_with_end=True, tokenizer=None, has_start_token=True, pad_to_max_length=True, min_length=None): + self.tokenizer = tokenizer + self.max_length = max_length + self.min_length = min_length + empty = self.tokenizer('')["input_ids"] + if has_start_token: + self.tokens_start = 1 + self.start_token = empty[0] + self.end_token = empty[1] + else: + self.tokens_start = 0 + self.start_token = None + self.end_token = empty[0] + self.pad_with_end = pad_with_end + self.pad_to_max_length = pad_to_max_length + vocab = self.tokenizer.get_vocab() + self.inv_vocab = {v: k for k, v in vocab.items()} + self.max_word_length = 8 + + + def tokenize_with_weights(self, text:str): + """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3.""" + if self.pad_with_end: + pad_token = self.end_token + else: + pad_token = 0 + batch = [] + if self.start_token is not None: + batch.append((self.start_token, 1.0)) + to_tokenize = text.replace("\n", " ").split(' ') + to_tokenize = [x for x in to_tokenize if x != ""] + for word in to_tokenize: + batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start:-1]]) + batch.append((self.end_token, 1.0)) + if self.pad_to_max_length: + batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch))) + if self.min_length is not None and len(batch) < self.min_length: + batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch))) + return [batch] + + +class SDXLClipGTokenizer(SDTokenizer): + def __init__(self, tokenizer): + super().__init__(pad_with_end=False, tokenizer=tokenizer) + + +class SD3Tokenizer: + def __init__(self): + clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + self.clip_l = SDTokenizer(tokenizer=clip_tokenizer) + self.clip_g = SDXLClipGTokenizer(clip_tokenizer) + self.t5xxl = T5XXLTokenizer() + + def tokenize_with_weights(self, text:str): + out = {} + out["g"] = self.clip_g.tokenize_with_weights(text) + out["l"] = self.clip_l.tokenize_with_weights(text) + out["t5xxl"] = self.t5xxl.tokenize_with_weights(text) + for k, v in out.items(): + out[k] = torch.tensor(v, dtype=torch.int64, device="cpu") + return out + + +class ClipTokenWeightEncoder: + def encode_token_weights(self, token_weight_pairs): + #tokens = list(map(lambda a: a[0], token_weight_pairs[0])) + tokens = token_weight_pairs[:,:,0] + out, pooled = self(tokens) + if pooled is not None: + first_pooled = pooled[0:1].cpu() + else: + first_pooled = pooled + output = [out[0:1]] + return torch.cat(output, dim=-2).cpu(), first_pooled + + +class SDClipModel(torch.nn.Module): + """Uses the CLIP transformer encoder for text (from huggingface)""" + LAYERS = ["last", "pooled", "hidden"] + def __init__(self, device="cpu", max_length=77, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=CLIPTextModel, + special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, return_projected_pooled=True): + super().__init__() + assert layer in self.LAYERS + self.transformer = model_class(textmodel_json_config, dtype, device) + 
self.num_layers = self.transformer.num_layers + self.max_length = max_length + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + self.layer = layer + self.layer_idx = None + self.special_tokens = special_tokens + self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055)) + self.layer_norm_hidden_state = layer_norm_hidden_state + self.return_projected_pooled = return_projected_pooled + if layer == "hidden": + assert layer_idx is not None + assert abs(layer_idx) < self.num_layers + self.set_clip_options({"layer": layer_idx}) + self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled) + + def encode_token_weights(self, token_weight_pairs): + pass + + def set_clip_options(self, options): + layer_idx = options.get("layer", self.layer_idx) + self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) + if layer_idx is None or abs(layer_idx) > self.num_layers: + self.layer = "last" + else: + self.layer = "hidden" + self.layer_idx = layer_idx + + def forward(self, token_weight_pairs): + #tokens = list(map(lambda a: a[0], token_weight_pairs[0])) + tokens = token_weight_pairs[:,:,0] + #backup_embeds = self.transformer.get_input_embeddings() + #device = backup_embeds.weight.device + #tokens = torch.LongTensor(tokens).to(device) + outputs = self.transformer(tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state) + #self.transformer.set_input_embeddings(backup_embeds) + if self.layer == "last": + z = outputs[0] + else: + z = outputs[1] + pooled_output = None + if len(outputs) >= 3: + if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None: + pooled_output = outputs[3].float() + elif outputs[2] is not None: + pooled_output = outputs[2].float() + out, pooled = z.float(), pooled_output + if pooled is not None: + first_pooled = pooled[0:1].cpu() + else: + first_pooled = pooled + output = [out[0:1]] + return torch.cat(output, dim=-2).cpu(), first_pooled + + +class SDXLClipG(SDClipModel): + """Wraps the CLIP-G model into the SD-CLIP-Model interface""" + def __init__(self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None): + if layer == "penultimate": + layer="hidden" + layer_idx=-2 + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False) + + +class T5XXLModel(SDClipModel): + """Wraps the T5-XXL model into the SD-CLIP-Model interface for convenience""" + def __init__(self, config, device="cpu", layer="last", layer_idx=None, dtype=None): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5) + + +################################################################################################# +### T5 implementation, for the T5-XXL text encoder portion, largely pulled from upstream impl +################################################################################################# + + +class T5XXLTokenizer(SDTokenizer): + """Wraps the T5 Tokenizer from HF into the SDTokenizer interface""" + def __init__(self): + super().__init__(pad_with_end=False, tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77) + + +class T5LayerNorm(torch.nn.Module): + def 
__init__(self, hidden_size, eps=1e-6, dtype=None, device=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + return self.weight.to(device=x.device, dtype=x.dtype) * x + + +class T5DenseGatedActDense(torch.nn.Module): + def __init__(self, model_dim, ff_dim, dtype, device): + super().__init__() + self.wi_0 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) + self.wi_1 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) + self.wo = nn.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device) + + def forward(self, x): + hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh") + hidden_linear = self.wi_1(x) + x = hidden_gelu * hidden_linear + x = self.wo(x) + return x + + +class T5LayerFF(torch.nn.Module): + def __init__(self, model_dim, ff_dim, dtype, device): + super().__init__() + self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device) + self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) + + def forward(self, x): + forwarded_states = self.layer_norm(x) + forwarded_states = self.DenseReluDense(forwarded_states) + x += forwarded_states + return x + + +class T5Attention(torch.nn.Module): + def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device): + super().__init__() + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.k = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.v = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.o = nn.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device) + self.num_heads = num_heads + self.relative_attention_bias = None + if relative_attention_bias: + self.relative_attention_num_buckets = 32 + self.relative_attention_max_distance = 128 + self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
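+        For example, with the defaults used here (bidirectional=True, num_buckets=32,
+        max_distance=128), distances 0-7 in each direction each get their own bucket,
+        distances 8-127 share logarithmically spaced buckets, and the sign of the
+        relative position selects the lower or upper half of the bucket range.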
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min(relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)) + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length, device): + """Compute binned relative position bias""" + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=True, + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward(self, x, past_bias=None): + q = self.q(x) + k = self.k(x) + v = self.v(x) + if self.relative_attention_bias is not None: + past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device) + if past_bias is not None: + mask = past_bias + out = attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask) + return self.o(out), past_bias + + +class T5LayerSelfAttention(torch.nn.Module): + def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): + super().__init__() + self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device) + self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) + + def forward(self, x, past_bias=None): + output, past_bias = self.SelfAttention(self.layer_norm(x), past_bias=past_bias) + x += output + return x, past_bias + + +class T5Block(torch.nn.Module): + def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): + super().__init__() + self.layer = torch.nn.ModuleList() + self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device)) + self.layer.append(T5LayerFF(model_dim, ff_dim, dtype, device)) + + def forward(self, 
x, past_bias=None): + x, past_bias = self.layer[0](x, past_bias) + x = self.layer[-1](x) + return x, past_bias + + +class T5Stack(torch.nn.Module): + def __init__(self, num_layers, model_dim, inner_dim, ff_dim, num_heads, vocab_size, dtype, device): + super().__init__() + self.embed_tokens = torch.nn.Embedding(vocab_size, model_dim, device=device) + self.block = torch.nn.ModuleList([T5Block(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device) for i in range(num_layers)]) + self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) + + def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True): + intermediate = None + x = self.embed_tokens(input_ids) + past_bias = None + for i, l in enumerate(self.block): + x, past_bias = l(x, past_bias) + if i == intermediate_output: + intermediate = x.clone() + x = self.final_layer_norm(x) + if intermediate is not None and final_layer_norm_intermediate: + intermediate = self.final_layer_norm(intermediate) + return x, intermediate + + +class T5(torch.nn.Module): + def __init__(self, config_dict, dtype, device): + super().__init__() + self.num_layers = config_dict["num_layers"] + self.encoder = T5Stack(self.num_layers, config_dict["d_model"], config_dict["d_model"], config_dict["d_ff"], config_dict["num_heads"], config_dict["vocab_size"], dtype, device) + self.dtype = dtype + + def get_input_embeddings(self): + return self.encoder.embed_tokens + + def set_input_embeddings(self, embeddings): + self.encoder.embed_tokens = embeddings + + def forward(self, *args, **kwargs): + return self.encoder(*args, **kwargs) diff --git a/models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py b/models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py new file mode 100644 index 000000000..1cdebc076 --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py @@ -0,0 +1,217 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import copy +import os +import sys +import math + +from safetensors import safe_open +from iree import runtime as ireert +from iree.compiler.ir import Context +import numpy as np +from shark_turbine.aot import * +from shark_turbine.dynamo.passes import ( + DEFAULT_DECOMPOSITIONS, +) +from turbine_models.custom_models.sd_inference import utils +import torch +import torch._dynamo as dynamo +from diffusers import SD3Transformer2DModel + + +class MMDiTModel(torch.nn.Module): + def __init__( + self, + hf_model_name = "stabilityai/stable-diffusion-3-medium-diffusers", + dtype=torch.float16, + ): + super().__init__() + self.mmdit = SD3Transformer2DModel.from_pretrained( + hf_model_name, + subfolder="transformer", + torch_dtype=dtype, + low_cpu_mem_usage=False, + ) + + + def forward( + self, hidden_states, encoder_hidden_states, pooled_projections, timestep, lora_scale, + ): + joint_attention_kwargs = { + "scale": lora_scale, + } + noise_pred = self.mmdit(hidden_states, encoder_hidden_states, pooled_projections, timestep,joint_attention_kwargs, return_dict=False)[0] + return noise_pred + + +@torch.no_grad() +def export_mmdit_model( + mmdit_model, + hf_model_name, + batch_size, + height, + width, + precision="fp32", + max_length=77, + hf_auth_token=None, + compile_to="torch", + external_weights=None, + external_weight_path=None, + device=None, + target_triple=None, + ireec_flags=None, + decomp_attn=False, + exit_on_vmfb=False, + pipeline_dir=None, + attn_spec=None, + input_mlir=None, + weights_only=False, +): + dtype = torch.float16 if args.precision == "fp16" else torch.float32 + if pipeline_dir: + safe_name = os.path.join(pipeline_dir, f"mmdit") + else: + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_mmdit", + ) + if decomp_attn == True: + ireec_flags += ",--iree-opt-aggressively-propagate-transposes=False" + + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name + "_" + target_triple, + mlir_source="file", + return_path=not exit_on_vmfb, + attn_spec=attn_spec, + ) + return vmfb_path + + mapper = {} + + utils.save_external_weights( + mapper, mmdit_model, external_weights, external_weight_path + ) + + if weights_only: + return external_weight_path + + do_classifier_free_guidance = True + init_batch_dim = 2 if do_classifier_free_guidance else 1 + + hidden_states_shape = ( + batch_size, + 16, + height // 8, + width // 8, + ) + encoder_hidden_states_shape = (batch_size, 77, 4096) + pooled_projections_shape = (batch_size, 2048) + example_forward_args = [ + torch.empty(hidden_states_shape, dtype=dtype), + torch.empty(encoder_hidden_states_shape, dtype=dtype), + torch.empty(pooled_projections_shape, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(1, dtype=dtype), + ] + + decomp_list = [] + if decomp_attn == True: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(mmdit_model) + + @fxb.export_program( + args=(example_forward_args,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) + + class CompiledMmdit(CompiledModule): + run_forward = _forward + + if external_weights: + 
externalize_module_parameters(mmdit_model) + + inst = CompiledMmdit(context=Context(), import_to="IMPORT") + + module_str = str(CompiledModule.get_mlir_module(inst)) + + if compile_to != "vmfb": + return module_str + else: + vmfb_path = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=True, + attn_spec=attn_spec, + ) + if exit_on_vmfb: + exit() + return vmfb_path + + +if __name__ == "__main__": + import logging + + logging.basicConfig(level=logging.DEBUG) + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + if args.input_mlir: + mmdit_model = None + else: + mmdit_model = MMDiTModel( + args.hf_model_name, + dtype=torch.float16 if args.precision == "fp16" else torch.float32 + ) + mod_str = export_mmdit_model( + mmdit_model, + args.hf_model_name, + args.batch_size, + args.height, + args.width, + args.precision, + args.max_length, + args.hf_auth_token, + args.compile_to, + args.external_weights, + args.external_weight_path, + args.device, + args.iree_target_triple, + args.ireec_flags + args.attn_flags + args.unet_flags, + args.decomp_attn, + attn_spec=args.attn_spec, + input_mlir=args.input_mlir, + weights_only=args.weights_only, + ) + if args.input_mlir: + exit() + safe_name = utils.create_safe_name( + args.hf_model_name, + f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_mmdit", + ) + if args.compile_to != "vmfb": + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 840c8bd1a..ce48dff33 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -3,6 +3,7 @@ import numpy as np import os import safetensors +import safetensors.numpy as safe_numpy import re from diffusers import ( PNDMScheduler, @@ -270,14 +271,22 @@ def save_external_weights( model, external_weights=None, external_weight_file=None, + force_format=False, ): if external_weights is not None: if external_weights in ["safetensors", "irpa"]: mod_params = dict(model.named_parameters()) + mod_buffers = dict(model.named_buffers()) + mod_params.update(mod_buffers) for name in mod_params: mapper["params." 
+ name] = name if external_weight_file and not os.path.isfile(external_weight_file): - safetensors.torch.save_file(mod_params, external_weight_file) + if not force_format: + safetensors.torch.save_file(mod_params, external_weight_file) + else: + for x in mod_params.keys(): + mod_params[x] = mod_params[x].numpy() + safe_numpy.save_file(mod_params, external_weight_file) print("Saved params to", external_weight_file) From d3f06f6222e350f603f5231681640e614fd9e062 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 13 Jun 2024 05:52:36 -0500 Subject: [PATCH 119/174] Achieve basic functionality for sd3 txt2img --- models/requirements.txt | 2 +- models/setup.py | 2 +- .../sd3_inference/sd3_cmd_opts.py | 16 +- .../custom_models/sd3_inference/sd3_full.py | 277 +++++++ .../{turbine_mmdit.py => sd3_mmdit.py} | 35 +- .../sd3_inference/sd3_mmdit_runner.py | 26 +- .../sd3_inference/sd3_pipeline.py | 696 ++++++++++++++++++ .../sd3_inference/sd3_schedulers.py | 15 +- .../sd3_inference/sd3_text_encoders.py | 41 +- .../sd3_inference/sd3_text_encoders_runner.py | 9 +- .../custom_models/sd3_inference/sd3_vae.py | 196 +++++ .../sd3_inference/sd3_vae_runner.py | 77 ++ .../sd3_inference/text_encoder_impls.py | 413 +++++++++-- 13 files changed, 1659 insertions(+), 146 deletions(-) create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_full.py rename models/turbine_models/custom_models/sd3_inference/{turbine_mmdit.py => sd3_mmdit.py} (89%) create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_vae.py create mode 100644 models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py diff --git a/models/requirements.txt b/models/requirements.txt index b775c76cd..87f92e7c6 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -4,7 +4,7 @@ shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 torchsde accelerate -diffusers @ git+https://github.com/nod-ai/diffusers@v0.28.2-shark +diffusers @ git+https://github.com/nod-ai/diffusers@0.29.0.dev0-shark brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading azure-storage-blob diff --git a/models/setup.py b/models/setup.py index e051b665e..2c54c7d43 100644 --- a/models/setup.py +++ b/models/setup.py @@ -59,7 +59,7 @@ def load_version_info(): "sentencepiece", "transformers==4.37.1", "accelerate", - "diffusers==0.24.0", + "diffusers==0.29.0.dev0", "azure-storage-blob", "einops", ], diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index 535135daa..e072fad2c 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -95,7 +95,7 @@ def is_valid_file(arg): p.add_argument( "--guidance_scale", type=float, - default=7.5, + default=4, help="Scale by which to adjust prompt guidance to the unconditional noise prediction output of UNet after each iteration.", ) @@ -207,9 +207,15 @@ def is_valid_file(arg): p.add_argument( "--vae_decomp_attn", type=bool, - default=False, + default=True, help="Decompose attention for VAE decode only at fx graph level", ) +p.add_argument( + "--vae_dtype", + type=str, + default="fp32", + help="Precision of VAE graph.", +) ############################################################################## # SD3 script 
general options. @@ -271,11 +277,7 @@ def is_valid_file(arg): default=None, help="Azure storage container name to download mlir files from.", ) -p.add_argument( - "--export", - type=str, - default="all", - help="clip, mmdit, vae, all") +p.add_argument("--export", type=str, default="all", help="clip, mmdit, vae, all") p.add_argument( "--output", type=str, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_full.py b/models/turbine_models/custom_models/sd3_inference/sd3_full.py new file mode 100644 index 000000000..f88cda03f --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_full.py @@ -0,0 +1,277 @@ +# Copyrigh 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import sys + +from iree import runtime as ireert +from iree.compiler.ir import Context +import numpy as np +from shark_turbine.aot import * +from turbine_models.custom_models.sd_inference import utils +import torch +import torch._dynamo as dynamo + +import safetensors +import argparse +from turbine_models.turbine_tank import turbine_tank + +SEED = 1 + + +def export_vae( + model, + height, + width, + compile_to="torch", + external_weight_prefix=None, + device=None, + target_triple=None, + max_alloc="", + upload_ir=False, + dtype=torch.float32, +): + mapper = {} + utils.save_external_weights(mapper, model, "safetensors", external_weight_prefix) + latent_shape = [1, 16, height // 8, width // 8] + input_arg = torch.empty(latent_shape) + input_arg = (input_arg.to(dtype),) + if external_weight_prefix != None and len(external_weight_prefix) > 1: + externalize_module_parameters(model) + + exported = export(model, args=input_arg) + + module_str = str(exported.mlir_module) + safe_name = utils.create_safe_name(str(dtype).lstrip("torch."), "_mmdit") + if compile_to != "vmfb": + return module_str + else: + print("compiling to vmfb") + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + return module_str + + +def export_unet_dynamic( + unet_model, + height, + width, + compile_to="torch", + external_weight_path=None, + device=None, + target_triple=None, + max_alloc="", + upload_ir=False, + dtype=torch.float32, +): + cond_shape = [1, 154, 4096] # 77, 4096] + pool_shape = [1, 2048] + latent_shape = [1, 16, height // 8, width // 8] + if dtype == torch.float16: + unet_model = unet_model.half() + mapper = {} + utils.save_external_weights(mapper, unet_model, "safetensors", external_weight_path) + + if weights_only: + return external_weight_path + + fxb = FxProgramsBuilder(unet_model) + + sigmas = torch.export.Dim("sigmas") + dynamic_shapes = {"sigmas": {0: sigmas}, "latent": {}, "noise": {}} + example_init_args = [ + torch.empty([19], dtype=dtype), + torch.empty(latent_shape, dtype=dtype), + torch.empty(latent_shape, dtype=dtype), + ] + example_sampling_args = [ + torch.empty(latent_shape, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(cond_shape, dtype=dtype), + torch.empty(pool_shape, dtype=dtype), + torch.empty(cond_shape, dtype=dtype), + torch.empty(pool_shape, dtype=dtype), + torch.empty(1, dtype=dtype), + ] + + @fxb.export_program(args=(example_init_args,), dynamic_shapes=dynamic_shapes) + def _initialize(module, inputs): + # 1.0 is denoise currently symfloat not supported in fx_importer + return module.init_dynamic(*inputs) + + 
@fxb.export_program(args=(example_sampling_args,)) + def _do_sampling(module, inputs): + return module.do_sampling(*inputs) + + class CompiledTresleches(CompiledModule): + initialize = _initialize + do_sampling = _do_sampling + + # _vae_decode = vae_decode + + if external_weights: + externalize_module_parameters(unet_model) + save_module_parameters(external_weight_path, unet_model) + + inst = CompiledTresleches(context=Context(), import_to="IMPORT") + module_str = str(CompiledModule.get_mlir_module(inst)) + print("exported model") + + safe_name = utils.create_safe_name(str(dtype).lstrip("torch."), "_mmdit") + if compile_to != "vmfb": + return module_str + else: + print("compiling to vmfb") + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + return module_str + + +def export_preprocessor( + model, + compile_to="torch", + external_weight_path=None, + device=None, + target_triple=None, + max_alloc="", + dtype=torch.float32, + height=512, + width=512, +): + external_weights = "safetensors" + + def get_noise(): + latent = torch.ones(1, 16, height // 8, width // 8, device="cpu") * 0.0609 + generator = torch.manual_seed(SEED) + return torch.randn( + latent.size(), + dtype=latent.dtype, + layout=latent.layout, + generator=generator, + device="cpu", + ) + + input_args = [torch.empty([1, 77, 2], dtype=torch.int64) for x in range(6)] + input_args += get_noise() + if dtype == torch.float16: + model = model.half() + + mapper = {} + + utils.save_external_weights(mapper, model, external_weights, external_weight_path) + + if external_weight_path != None and len(external_weight_path) > 1: + print("externalizing weights") + externalize_module_parameters(model) + + exported = export(model, args=tuple(input_args)) + print("exported model") + + # import_to = "INPUT" if compile_to == "linalg" else "IMPORT" + # inst = CompiledTresleches(context=Context(), import_to=import_to) + + # module_str = str(CompiledModule.get_mlir_module(inst)) + module_str = str(exported.mlir_module) + safe_name = utils.create_safe_name("sd3", "clips") + if compile_to != "vmfb": + return module_str + else: + print("compiling to vmfb") + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + return module_str + + +@torch.no_grad() +def main(args): + import turbine_sd3 + from safetensors import safe_open + + vulkan_max_allocation = "4294967296" if args.device == "vulkan" else "" + # st_file = "/mnt2/tresleches/models/sd3_8b_beta.safetensors" + st_file = "/mnt2/tresleches/models/sd3_2b_512_alpha.safetensors" + dtype = torch.float32 + if args.precision == "f16": + dtype = torch.float16 + elif args.precision == "bf16": + dtype = torch.bfloat16 + print(args.export) + + if args.export in ["dynamic"]: + print("exporting dynamic") + unet_model = turbine_sd3.SD3Inferencer( + model=st_file, vae=turbine_sd3.VAEFile, shift=1.0, dtype=dtype + ).eval() + mod_str = export_unet_dynamic( + unet_model=unet_model, + height=args.height, + width=args.width, + compile_to=args.compile_to, + external_weight_path=args.external_weight_path, + device=args.device, + target_triple=args.iree_target_triple, + max_alloc=vulkan_max_allocation, + upload_ir=False, + dtype=dtype, + ) + safe_name = utils.create_safe_name("hc_sd3", "-unet") + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") + export_pre = args.export in ["all", "clip"] + print(export_pre) + if export_pre: + print("exporting preprocessor") + pre = turbine_sd3.Preprocess() + mod_str = 
export_preprocessor( + model=pre, + compile_to=args.compile_to, + external_weight_path=args.external_weight_path, + device=args.device, + target_triple=args.iree_target_triple, + max_alloc=vulkan_max_allocation, + dtype=dtype, + height=args.height, + width=args.width, + ) + safe_name = utils.create_safe_name("hc_sd3", "_preprocess") + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") + should_export_vae = args.export in ["all", "vae"] + if should_export_vae: + print("exporting vae") + from turbine_impls import SDVAE + + with turbine_sd3.safe_open( + turbine_sd3.VAEFile, framework="pt", device="cpu" + ) as f: + vae = SDVAE(device="cpu", dtype=dtype).eval().cpu() + prefix = "" + if any(k.startswith("first_stage_model.") for k in f.keys()): + prefix = "first_stage_model." + turbine_sd3.load_into(f, vae, prefix, "cpu", dtype) + print("Something") + mod_str = export_vae( + model=vae, + height=args.height, + width=args.width, + compile_to=args.compile_to, + external_weight_prefix=args.external_weight_path, + device=args.device, + target_triple=args.iree_target_triple, + max_alloc=vulkan_max_allocation, + dtype=dtype, + ) + safe_name = utils.create_safe_name("hc_sd3", "_vae") + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") + + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + torch._dynamo.config.capture_scalar_outputs = True + main(args) diff --git a/models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py similarity index 89% rename from models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py rename to models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 1cdebc076..85414a1e1 100644 --- a/models/turbine_models/custom_models/sd3_inference/turbine_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -25,10 +25,10 @@ class MMDiTModel(torch.nn.Module): def __init__( - self, - hf_model_name = "stabilityai/stable-diffusion-3-medium-diffusers", - dtype=torch.float16, - ): + self, + hf_model_name="stabilityai/stable-diffusion-3-medium-diffusers", + dtype=torch.float16, + ): super().__init__() self.mmdit = SD3Transformer2DModel.from_pretrained( hf_model_name, @@ -36,15 +36,21 @@ def __init__( torch_dtype=dtype, low_cpu_mem_usage=False, ) - def forward( - self, hidden_states, encoder_hidden_states, pooled_projections, timestep, lora_scale, + self, + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, ): - joint_attention_kwargs = { - "scale": lora_scale, - } - noise_pred = self.mmdit(hidden_states, encoder_hidden_states, pooled_projections, timestep,joint_attention_kwargs, return_dict=False)[0] + noise_pred = self.mmdit( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + return_dict=False, + )[0] return noise_pred @@ -71,7 +77,7 @@ def export_mmdit_model( input_mlir=None, weights_only=False, ): - dtype = torch.float16 if args.precision == "fp16" else torch.float32 + dtype = torch.float16 if precision == "fp16" else torch.float32 if pipeline_dir: safe_name = os.path.join(pipeline_dir, f"mmdit") else: @@ -106,21 +112,20 @@ def export_mmdit_model( do_classifier_free_guidance = True init_batch_dim = 2 if do_classifier_free_guidance else 1 - + batch_size = batch_size * init_batch_dim hidden_states_shape = ( batch_size, 16, height // 8, width // 8, ) - 
encoder_hidden_states_shape = (batch_size, 77, 4096) + encoder_hidden_states_shape = (batch_size, 154, 4096) pooled_projections_shape = (batch_size, 2048) example_forward_args = [ torch.empty(hidden_states_shape, dtype=dtype), torch.empty(encoder_hidden_states_shape, dtype=dtype), torch.empty(pooled_projections_shape, dtype=dtype), torch.empty(1, dtype=dtype), - torch.empty(1, dtype=dtype), ] decomp_list = [] @@ -183,7 +188,7 @@ class CompiledMmdit(CompiledModule): else: mmdit_model = MMDiTModel( args.hf_model_name, - dtype=torch.float16 if args.precision == "fp16" else torch.float32 + dtype=torch.float16 if args.precision == "fp16" else torch.float32, ) mod_str = export_mmdit_model( mmdit_model, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py index fe3ae2b4e..d730e140e 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -15,10 +15,8 @@ def run_mmdit_turbine( encoder_hidden_states, pooled_projections, timestep, - lora_scale, args, ): - torch_dtype = torch.float16 if args.precision == "fp16" else torch.float32 mmdit_runner = vmfbRunner( args.device, args.vmfb_path, @@ -29,9 +27,10 @@ def run_mmdit_turbine( ireert.asdevicearray(mmdit_runner.config.device, encoder_hidden_states), ireert.asdevicearray(mmdit_runner.config.device, pooled_projections), ireert.asdevicearray(mmdit_runner.config.device, timestep), - ireert.asdevicearray(mmdit_runner.config.device, lora_scale), ] - noise_pred = mmdit_runner.ctx.modules.compiled_mmdit["run_forward"](*iree_inputs).to_host() + noise_pred = mmdit_runner.ctx.modules.compiled_mmdit["run_forward"]( + *iree_inputs + ).to_host() return noise_pred @@ -41,16 +40,19 @@ def run_diffusers_mmdit( encoder_hidden_states, pooled_projections, timestep, - lora_scale, args, ): - from turbine_models.custom_models.sd3_inference.turbine_mmdit import MMDiTModel + from turbine_models.custom_models.sd3_inference.sd3_mmdit import MMDiTModel + mmdit_model = MMDiTModel( args.hf_model_name, dtype=torch.float32, ) noise_pred = mmdit_model.forward( - hidden_states.float(), encoder_hidden_states.float(), pooled_projections.float(), timestep.float(), lora_scale.float() + hidden_states.float(), + encoder_hidden_states.float(), + pooled_projections.float(), + timestep.float(), ) return noise_pred.numpy() @@ -72,18 +74,16 @@ def run_diffusers_mmdit( (args.batch_size, 16, args.height // 8, args.width // 8), dtype=dtype ) encoder_hidden_states = torch.randn( - (args.batch_size, args.max_length, 4096), dtype=dtype + (args.batch_size, args.max_length * 2, 4096), dtype=dtype ) pooled_projections = torch.randn((args.batch_size, 2048), dtype=dtype) timestep = torch.tensor([0], dtype=dtype) - lora_scale = torch.tensor([1.0], dtype=dtype) turbine_output = run_mmdit_turbine( hidden_states, encoder_hidden_states, pooled_projections, timestep, - lora_scale, args, ) print( @@ -101,15 +101,11 @@ def run_diffusers_mmdit( encoder_hidden_states, pooled_projections, timestep, - lora_scale, args, ) print("torch OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) print("\n(torch (comfy) image latents to iree image latents): ") - np.testing.assert_allclose( - turbine_output, torch_output, rtol=4e-2, atol=4e-2 - ) + np.testing.assert_allclose(turbine_output, torch_output, rtol=4e-2, atol=4e-2) print("passed!") - diff --git 
a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py new file mode 100644 index 000000000..cd629f5ad --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -0,0 +1,696 @@ +# Copyright 2024 Advanced Micro Devices, inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import logging +import torch +from turbine_models.custom_models.sd3_inference import ( + sd3_text_encoders, + sd3_mmdit, + sd3_vae, + sd3_schedulers, +) +from turbine_models.custom_models.sd3_inference.text_encoder_impls import SD3Tokenizer +import iree.runtime as ireert +from turbine_models.custom_models.sd_inference import utils +from turbine_models.model_runner import vmfbRunner +from transformers import CLIPTokenizer + +from PIL import Image +import os +import numpy as np +import time +import copy +from datetime import datetime as dt + +device_list = [ + "cpu", + "vulkan", + "cuda", + "rocm", +] + +rt_device_list = [ + "local-task", + "local-sync", + "vulkan", + "cuda", + "rocm", + "hip", +] + +empty_pipe_dict = { + "vae": None, + "text_encoders": None, + "mmdit": None, + "scheduler": None, +} + +EMPTY_FLAGS = { + "clip": None, + "mmdit": None, + "vae": None, + "pipeline": None, +} + + +class SharkSD3Pipeline: + def __init__( + self, + hf_model_name: str, + # scheduler_id: str, + height: int, + width: int, + shift: float, + precision: str, + max_length: int, + batch_size: int, + num_inference_steps: int, + device: str, + iree_target_triple: str, + ireec_flags: dict = EMPTY_FLAGS, + attn_spec: str = None, + decomp_attn: bool = False, + pipeline_dir: str = "./shark_vmfbs", + external_weights_dir: str = "./shark_weights", + external_weights: str = "safetensors", + vae_decomp_attn: bool = True, + custom_vae: str = "", + cpu_scheduling: bool = False, + ): + self.hf_model_name = hf_model_name + # self.scheduler_id = scheduler_id + self.height = height + self.width = width + self.shift = shift + self.precision = precision + self.max_length = max_length + self.batch_size = batch_size + self.num_inference_steps = num_inference_steps + self.device = device + self.iree_target_triple = iree_target_triple + self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS + self.attn_spec = attn_spec + self.decomp_attn = decomp_attn + self.pipeline_dir = pipeline_dir + self.external_weights_dir = external_weights_dir + self.external_weights = external_weights + self.vae_decomp_attn = vae_decomp_attn + self.custom_vae = custom_vae + self.cpu_scheduling = cpu_scheduling + self.torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 + self.vae_dtype = torch.float32 + # TODO: set this based on user-inputted guidance scale and negative prompt. + self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + + # FILE MANAGEMENT AND PIPELINE SETUP + + def check_prepared( + self, + mlirs: dict, + vmfbs: dict, + weights: dict, + interactive: bool = True, + ): + ready, vmfbs, weights = self.is_prepared(vmfbs, weights) + if not ready: + if interactive: + do_continue = input( + f"\nIt seems you are missing some necessary files. Would you like to generate them now? 
(y/n)" + ) + if do_continue.lower() != "y": + exit() + else: + do_continue = "y" + if do_continue.lower() == "y": + for submodel in vmfbs.keys(): + if vmfbs[submodel] == None: + print(submodel) + vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) + vmfbs[submodel] = vmfb + if weights[submodel] is None: + weights[submodel] = weight + elif weights[submodel] is None and not any( + x in submodel for x in ["pipeline", "scheduler"] + ): + _, weight = self.export_submodel(submodel, weights_only=True) + weights[submodel] = weight + ready, vmfbs, weights = self.is_prepared(vmfbs, weights) + if ready: + print("All necessary files found.") + return vmfbs, weights + else: + print("There was an error generating the necessary files.") + exit() + else: + print("All necessary files found. Loading pipeline.") + return vmfbs, weights + + def is_prepared(self, vmfbs, weights): + missing = [] + for key in vmfbs: + if key == "scheduler" and not self.cpu_scheduling: + val = f"EulerFlowScheduler_{self.num_inference_steps}" + default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") + elif key == "scheduler": + val = None + default_filepath = None + continue + else: + val = vmfbs[key] + default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") + if vmfbs[key] is not None and os.path.exists(vmfbs[key]): + continue + elif vmfbs[key] == None and os.path.exists(default_filepath): + vmfbs[key] = default_filepath + elif val is None: + missing.append(key + ".vmfb") + else: + missing.append(val + ".vmfb") + for w_key in weights: + if any(x in w_key for x in ["pipeline", "scheduler"]): + continue + if weights[w_key] is not None: + continue + if self.external_weights is None: + continue + default_name = os.path.join( + self.external_weights_dir, w_key + "." + self.external_weights + ) + if w_key == "text_encoders": + default_name = os.path.join( + self.external_weights_dir, f"sd3_clip_fp16.irpa" + ) + if weights[w_key] is None and os.path.exists(default_name): + weights[w_key] = os.path.join(default_name) + else: + missing.append(w_key + "." + self.external_weights) + if len(missing) > 0: + print(f"Missing files: " + ", ".join(missing)) + return False, vmfbs, weights + else: + return True, vmfbs, weights + + def get_mlir_from_turbine_tank(self, submodel, container_name): + from turbine_models.turbine_tank import downloadModelArtifacts + + safe_name = utils.create_safe_name( + self.hf_model_name, + f"_{self.max_length}_{self.height}x{self.width}_{self.precision}_{submodel}.mlir", + ) + mlir_path = downloadModelArtifacts( + safe_name, + container_name, + ) + return mlir_path + + # IMPORT / COMPILE PHASE + + def get_torch_models(self, submodel): + match submodel: + case "vae": + vae_torch = sd3_vae.VaeModel( + # This is a public model, so no auth required + self.hf_model_name, + ) + return vae_torch + case "mmdit": + mmdit_torch = sd3_mmdit.MMDiTModel( + dtype=self.torch_dtype, + ) + return mmdit_torch + + def export_submodel( + self, + submodel: str, + input_mlir: str = None, + weights_only: bool = False, + ): + if not os.path.exists(self.pipeline_dir): + os.makedirs(self.pipeline_dir) + if self.external_weights and self.external_weights_dir: + if not os.path.exists(self.external_weights_dir): + os.makedirs(self.external_weights_dir, exist_ok=True) + vae_external_weight_path = os.path.join( + self.external_weights_dir, "vae." + self.external_weights + ) + mmdit_external_weight_path = os.path.join( + self.external_weights_dir, + f"sd3_mmdit_{self.precision}." 
+ self.external_weights, + ) + text_encoders_external_weight_path = os.path.join( + self.external_weights_dir, f"sd3_text_encoders_{self.precision}.irpa" + ) + elif self.external_weights is None: + print( + "No external weights type specified using --external_weights, weights for imported .mlir files will not be externalized." + ) + vae_external_weight_path = None + mmdit_external_weight_path = None + text_encoders_external_weight_path = None + else: + print( + f"No external weights directory specified using --external_weights_dir, we assume you have your own weights in {self.pipeline_dir}." + ) + if not os.path.exists(self.pipeline_dir): + os.makedirs(self.pipeline_dir, exist_ok=True) + vae_external_weight_path = os.path.join( + self.pipeline_dir, "vae." + self.external_weights + ) + mmdit_external_weight_path = os.path.join( + self.pipeline_dir, + f"sd3_mmdit_{self.precision}." + self.external_weights, + ) + text_encoders_external_weight_path = os.path.join( + self.pipeline_dir, f"sd3_text_encoders_{self.precision}.irpa" + ) + if weights_only: + input_mlir = { + "vae": None, + "text_encoders": None, + "mmdit": None, + "scheduler": None, + } + match submodel: + case "mmdit": + if not input_mlir[submodel]: + mmdit_torch = self.get_torch_models("mmdit") + else: + mmdit_torch = None + mmdit_vmfb = sd3_mmdit.export_mmdit_model( + mmdit_torch, + self.hf_model_name, + self.batch_size, + self.height, + self.width, + self.precision, + self.max_length, + None, + "vmfb", + self.external_weights, + mmdit_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["mmdit"], + self.decomp_attn, + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + attn_spec=self.attn_spec, + input_mlir=input_mlir["mmdit"], + weights_only=weights_only, + ) + del mmdit_torch + return mmdit_vmfb, mmdit_external_weight_path + case "scheduler": + scheduler_vmfb = sd3_schedulers.export_scheduler_model( + self.hf_model_name, + self.batch_size, + self.height, + self.width, + self.shift, + self.num_inference_steps, + self.precision, + "vmfb", + self.device, + self.iree_target_triple, + self.ireec_flags["scheduler"], + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + input_mlir=input_mlir["scheduler"], + ) + return scheduler_vmfb, None + case "vae": + if not input_mlir[submodel]: + vae_torch = self.get_torch_models("vae") + else: + vae_torch = None + vae_vmfb = sd3_vae.export_vae_model( + vae_torch, + self.hf_model_name, + self.batch_size, + self.height, + self.width, + "fp32", + "vmfb", + self.external_weights, + vae_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["vae"], + self.vae_decomp_attn, + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + attn_spec=self.attn_spec, + input_mlir=input_mlir["vae"], + weights_only=weights_only, + ) + del vae_torch + return vae_vmfb, vae_external_weight_path + case "text_encoders": + _, text_encoders_vmfb = sd3_text_encoders.export_text_encoders( + self.hf_model_name, + None, + self.max_length, + self.precision, + "vmfb", + self.external_weights, + text_encoders_external_weight_path, + self.device, + self.iree_target_triple, + self.ireec_flags["clip"], + exit_on_vmfb=False, + pipeline_dir=self.pipeline_dir, + input_mlir=input_mlir["text_encoders"], + attn_spec=self.attn_spec, + output_batchsize=self.batch_size, + ) + return text_encoders_vmfb, text_encoders_external_weight_path + + # LOAD + + def load_pipeline( + self, + vmfbs: dict, + weights: dict, + rt_device: str = "local-task", + compiled_pipeline: bool = False, 
+ split_scheduler: bool = True, + ): + self.runners = {} + runners = {} + load_start = time.time() + runners["pipe"] = vmfbRunner( + rt_device, + vmfbs["mmdit"], + weights["mmdit"], + ) + unet_loaded = time.time() + print("\n[LOG] MMDiT loaded in ", unet_loaded - load_start, "sec") + + runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( + rt_device, + vmfbs["scheduler"], + ) + + sched_loaded = time.time() + print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") + runners["vae"] = vmfbRunner( + rt_device, + vmfbs["vae"], + weights["vae"], + ) + vae_loaded = time.time() + print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") + runners["text_encoders"] = vmfbRunner( + rt_device, + vmfbs["text_encoders"], + weights["text_encoders"], + ) + clip_loaded = time.time() + print("\n[LOG] Text Encoders loaded in ", clip_loaded - vae_loaded, "sec") + + tok_start = time.time() + self.tokenizer = SD3Tokenizer() + tok_loaded = time.time() + print("\n[LOG] Tokenizers loaded in ", tok_loaded - tok_start, "sec") + self.runners = runners + self.compiled_pipeline = compiled_pipeline + print("Successfully loaded pipeline.") + + # RUN + + def generate_images( + self, + prompt: str, + negative_prompt: str = "", + batch_count: int = 1, + guidance_scale: float = 4, + seed: float = -1, + return_imgs: bool = False, + ): + # TODO: implement case where this is false e.g. in SDXL Turbo + do_classifier_free_guidance = True + + # Workaround for turbo support (guidance_scale 0) + if guidance_scale == 0: + negative_prompt = prompt + prompt = "" + + iree_dtype = "float32" if self.precision == "fp32" else "float16" + torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 + + samples = [] + numpy_images = [] + + for i in range(batch_count): + generator = torch.random.manual_seed(seed + i) + rand_sample = torch.randn( + ( + self.batch_size, + 16, + self.height // 8, + self.width // 8, + ), + generator=generator, + dtype=torch_dtype, + ) + samples.append( + ireert.asdevicearray( + self.runners["pipe"].config.device, rand_sample, dtype=iree_dtype + ) + ) + + guidance_scale = ireert.asdevicearray( + self.runners["pipe"].config.device, + np.asarray([guidance_scale]), + dtype=iree_dtype, + ) + + tokenize_start = time.time() + text_input_ids_dict = self.tokenizer.tokenize_with_weights(prompt) + uncond_input_ids_dict = self.tokenizer.tokenize_with_weights(negative_prompt) + text_input_ids_list = list(text_input_ids_dict.values()) + uncond_input_ids_list = list(uncond_input_ids_dict.values()) + text_encoders_inputs = [ + ireert.asdevicearray( + self.runners["text_encoders"].config.device, text_input_ids_list[0] + ), + ireert.asdevicearray( + self.runners["text_encoders"].config.device, text_input_ids_list[1] + ), + ireert.asdevicearray( + self.runners["text_encoders"].config.device, text_input_ids_list[2] + ), + ireert.asdevicearray( + self.runners["text_encoders"].config.device, uncond_input_ids_list[0] + ), + ireert.asdevicearray( + self.runners["text_encoders"].config.device, uncond_input_ids_list[1] + ), + ireert.asdevicearray( + self.runners["text_encoders"].config.device, uncond_input_ids_list[2] + ), + ] + + # Tokenize prompt and negative prompt. 
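+        # The six device arrays above hold token ids for the prompt and the negative prompt,
+        # one tensor each for the CLIP-L, CLIP-G and T5-XXL tokenizers. encode_tokens returns
+        # prompt embeddings and pooled embeddings covering both the conditional and
+        # unconditional inputs, which the doubled-batch MMDiT export consumes for
+        # classifier-free guidance in the denoising loop below.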
+ encode_prompts_start = time.time() + prompt_embeds, pooled_prompt_embeds = self.runners[ + "text_encoders" + ].ctx.modules.compiled_text_encoder["encode_tokens"](*text_encoders_inputs) + + encode_prompts_end = time.time() + + for i in range(batch_count): + unet_start = time.time() + sample, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) + iree_inputs = [ + sample, + ireert.asdevicearray( + self.runners["pipe"].config.device, prompt_embeds, dtype=iree_dtype + ), + ireert.asdevicearray( + self.runners["pipe"].config.device, + pooled_prompt_embeds, + dtype=iree_dtype, + ), + None, + ] + for s in range(steps): + # print(f"step {s}") + if self.cpu_scheduling: + step_index = s + else: + step_index = ireert.asdevicearray( + self.runners["scheduler"].runner.config.device, + torch.tensor([s]), + "int64", + ) + latents, t = self.runners["scheduler"].prep( + sample, + step_index, + timesteps, + ) + noise_pred = self.runners["pipe"].ctx.modules.compiled_mmdit[ + "run_forward" + ]( + latents, + iree_inputs[1], + iree_inputs[2], + t, + ) + sample = self.runners["scheduler"].step( + noise_pred, + t, + sample, + guidance_scale, + step_index, + ) + if isinstance(sample, torch.Tensor): + latents = ireert.asdevicearray( + self.runners["vae"].config.device, + sample, + dtype=self.vae_dtype, + ) + else: + latents = sample.astype("float32") + + vae_start = time.time() + vae_out = self.runners["vae"].ctx.modules.compiled_vae["decode"](latents) + + pipe_end = time.time() + + image = vae_out.to_host() + + numpy_images.extend([image]) + print("Batch #", i + 1, "\n") + print( + "UNet time(", + self.num_inference_steps, + "): ", + vae_start - unet_start, + "sec,", + ) + print( + "Unet average step latency: ", + (vae_start - unet_start) / self.num_inference_steps, + "sec", + ) + print("VAE time: ", pipe_end - vae_start, "sec") + print( + f"\nTotal time (txt2img, batch #{str(i+1)}): ", + (encode_prompts_end - encode_prompts_start) + (pipe_end - unet_start), + "sec\n", + ) + end = time.time() + print("Total CLIP time:", encode_prompts_end - encode_prompts_start, "sec") + print("Total tokenize time:", encode_prompts_start - tokenize_start, "sec") + if batch_count > 1: + print( + f"Total inference time ({batch_count} batch(es)):", + end - encode_prompts_start, + "sec", + ) + timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S") + images = [] + for idx, image in enumerate(numpy_images): + if image.ndim == 4: + image = image[0] + image = torch.from_numpy(image).cpu().permute(1, 2, 0).float().numpy() + image = (image * 255).round().astype("uint8") + out_image = Image.fromarray(image) + images.extend([[out_image]]) + if return_imgs: + return images + for idx_batch, image_batch in enumerate(images): + for idx, image in enumerate(image_batch): + img_path = ( + "sd3_output_" + + timestamp + + "_" + + str(idx_batch) + + "_" + + str(idx) + + ".png" + ) + image.save(img_path) + print(img_path, "saved") + return + + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + map = empty_pipe_dict + mlirs = copy.deepcopy(map) + vmfbs = copy.deepcopy(map) + weights = copy.deepcopy(map) + ireec_flags = { + "clip": args.ireec_flags + args.clip_flags, + "mmdit": args.ireec_flags + args.unet_flags, + "vae": args.ireec_flags + args.vae_flags, + "pipeline": args.ireec_flags, + "scheduler": args.ireec_flags, + } + if not args.pipeline_dir: + pipe_id_list = [ + args.hf_model_name.split("/")[-1], + str(args.height), + str(args.width), + str(args.max_length), + args.precision, + 
args.device, + ] + if args.decomp_attn: + pipe_id_list.append("decomp") + args.pipeline_dir = os.path.join( + ".", + "_".join(pipe_id_list), + ) + if args.input_mlir: + user_mlir_list = args.input_mlir.split(",") + else: + user_mlir_list = [] + for submodel_id, mlir_path in zip(mlirs.keys(), user_mlir_list): + if submodel_id in mlir_path: + mlirs[submodel_id] = mlir_path + if not args.external_weights_dir and args.external_weights: + args.external_weights_dir = args.pipeline_dir + sd3_pipe = SharkSD3Pipeline( + args.hf_model_name, + args.height, + args.width, + args.shift, + args.precision, + args.max_length, + args.batch_size, + args.num_inference_steps, + args.device, + args.iree_target_triple, + ireec_flags, + args.attn_spec, + args.decomp_attn, + args.pipeline_dir, + args.external_weights_dir, + args.external_weights, + args.vae_decomp_attn, + custom_vae=None, + cpu_scheduling=args.cpu_scheduling, + ) + vmfbs, weights = sd3_pipe.check_prepared(mlirs, vmfbs, weights) + if args.cpu_scheduling: + vmfbs.pop("scheduler") + weights.pop("scheduler") + sd3_pipe.load_pipeline( + vmfbs, weights, args.rt_device, args.compiled_pipeline, args.split_scheduler + ) + sd3_pipe.generate_images( + args.prompt, + args.negative_prompt, + args.batch_count, + args.guidance_scale, + args.seed, + False, + ) + print("Image generation complete.") diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 87492a701..0d4078605 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -28,12 +28,12 @@ def __init__(self, rt_device, vmfb): self.runner = vmfbRunner(rt_device, vmfb, None) def initialize(self, sample): - sample, time_ids, steps, timesteps = self.runner.ctx.modules.compiled_scheduler[ + sample, steps, timesteps = self.runner.ctx.modules.compiled_scheduler[ "run_init" ](sample) return sample, steps.to_host(), timesteps - def prepare_model_input(self, sample, t, timesteps): + def prep(self, sample, t, timesteps): return self.runner.ctx.modules.compiled_scheduler["run_prep"]( sample, t, timesteps ) @@ -54,7 +54,9 @@ def __init__( super().__init__() # For now, assumes SDXL implementation. May not need parametrization for other models, # but keeping hf_model_name in case. 
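+        # FlowMatchEulerDiscreteScheduler (from diffusers) supplies the flow-matching Euler
+        # sigma/timestep schedule; set_timesteps() below fixes num_inference_steps, and the
+        # pipeline drives this wrapper through its initialize()/prep()/step() entry points.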
- self.model = FlowMatchEulerDiscreteScheduler.from_pretrained(hf_model_name, subfolder="scheduler") + self.model = FlowMatchEulerDiscreteScheduler.from_pretrained( + hf_model_name, subfolder="scheduler" + ) self.do_classifier_free_guidance = True self.model.set_timesteps(num_inference_steps) self.timesteps = self.model.timesteps @@ -149,6 +151,7 @@ def export_scheduler_model( batch_size: int = 1, height: int = 512, width: int = 512, + shift: int = 1.0, num_inference_steps: int = 30, precision: str = "fp16", compile_to: str = "torch", @@ -161,9 +164,7 @@ def export_scheduler_model( upload_ir=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 - scheduler_module = FlowSchedulingModel( - hf_model_name, num_inference_steps, dtype - ) + scheduler_module = FlowSchedulingModel(hf_model_name, num_inference_steps, dtype) if pipeline_dir: vmfb_names = [ "EulerFlowScheduler", @@ -291,6 +292,7 @@ class CompiledScheduler(CompiledModule): exit() return vmfb + if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args @@ -299,6 +301,7 @@ class CompiledScheduler(CompiledModule): args.batch_size, args.height, args.width, + args.shift, args.num_inference_steps, args.precision, args.compile_to, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index 895f27bf7..89bee3cb1 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -15,7 +15,12 @@ from shark_turbine.aot import * from turbine_models.custom_models.sd_inference import utils import torch -from turbine_models.custom_models.sd3_inference.text_encoder_impls import SDClipModel, SDXLClipG, T5XXLModel, load_into +from turbine_models.custom_models.sd3_inference.text_encoder_impls import ( + SDClipModel, + SDXLClipG, + T5XXLModel, + load_into, +) from huggingface_hub import hf_hub_download from safetensors import safe_open @@ -24,7 +29,7 @@ "hidden_size": 1280, "intermediate_size": 5120, "num_attention_heads": 20, - "num_hidden_layers": 32 + "num_hidden_layers": 32, } CLIPL_CONFIG = { @@ -32,7 +37,7 @@ "hidden_size": 768, "intermediate_size": 3072, "num_attention_heads": 12, - "num_hidden_layers": 12 + "num_hidden_layers": 12, } T5_CONFIG = { @@ -40,9 +45,10 @@ "d_model": 4096, "num_heads": 64, "num_layers": 24, - "vocab_size": 32128 + "vocab_size": 32128, } + class TextEncoderModule(torch.nn.Module): @torch.no_grad() def __init__( @@ -58,25 +64,25 @@ def __init__( dtype=self.dtype, layer_norm_hidden_state=False, return_projected_pooled=False, - textmodel_json_config=CLIPL_CONFIG + textmodel_json_config=CLIPL_CONFIG, ).half() clip_l_weights = hf_hub_download( repo_id="stabilityai/stable-diffusion-3-medium", - filename="text_encoders/clip_l.safetensors" + filename="text_encoders/clip_l.safetensors", ) with safe_open(clip_l_weights, framework="pt", device="cpu") as f: load_into(f, self.clip_l.transformer, "", "cpu", self.dtype) self.clip_g = SDXLClipG(CLIPG_CONFIG, device="cpu", dtype=self.dtype).half() clip_g_weights = hf_hub_download( repo_id="stabilityai/stable-diffusion-3-medium", - filename="text_encoders/clip_g.safetensors" + filename="text_encoders/clip_g.safetensors", ) with safe_open(clip_g_weights, framework="pt", device="cpu") as f: load_into(f, self.clip_g.transformer, "", "cpu", self.dtype) self.t5xxl = T5XXLModel(T5_CONFIG, device="cpu", dtype=self.dtype).half() t5_weights = 
hf_hub_download( repo_id="stabilityai/stable-diffusion-3-medium", - filename="text_encoders/t5xxl_fp16.safetensors" + filename="text_encoders/t5xxl_fp16.safetensors", ) with safe_open(t5_weights, framework="pt", device="cpu") as f: load_into(f, self.t5xxl.transformer, "", "cpu", self.dtype) @@ -90,7 +96,9 @@ def get_cond(self, tokens_l, tokens_g, tokens_t5xxl): t5_out, _ = self.t5xxl.forward(tokens_t5xxl) lg_out = torch.cat([l_out, g_out], dim=-1) lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1])) - return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1) + return torch.cat([lg_out, t5_out], dim=-2), torch.cat( + (l_pooled, g_pooled), dim=-1 + ) def forward(self, tokens_g, tokens_l, tokens_t5xxl, neg_g, neg_l, neg_t5): conditioning, cond_pool = self.get_cond(tokens_l, tokens_g, tokens_t5xxl) @@ -101,8 +109,9 @@ def forward(self, tokens_g, tokens_l, tokens_t5xxl, neg_g, neg_l, neg_t5): return prompt_embeds, pooled_prompt_embeds + @torch.no_grad() -def export_text_encoder( +def export_text_encoders( hf_model_name, hf_auth_token=None, max_length=64, @@ -144,9 +153,11 @@ def export_text_encoder( ) mapper = {} - assert ".safetensors" not in external_weight_path, "Original parameters format incompatible with IREE safetensors parser. Use '.irpa' instead." - - input_args = [torch.empty([1,77,2], dtype=torch.int64) for x in range(6)] + assert ( + ".safetensors" not in external_weight_path + ), "Original parameters format incompatible with IREE safetensors parser. Use '.irpa' instead." + + input_args = [torch.empty([1, 77, 2], dtype=torch.int64) for x in range(6)] decomp_list = [] if decomp_attn == True: @@ -200,7 +211,7 @@ class CompiledTextEncoder(CompiledModule): if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args - mod_str, _ = export_text_encoder( + mod_str, _ = export_text_encoders( args.hf_model_name, args.hf_auth_token, args.max_length, @@ -217,7 +228,7 @@ class CompiledTextEncoder(CompiledModule): attn_spec=args.attn_spec, output_batchsize=args.batch_size, ) - if args.input_mlir or args.weights_only or args.compile_to=="vmfb": + if args.input_mlir or args.weights_only or args.compile_to == "vmfb": exit() safe_name = utils.create_safe_name( args.hf_model_name, f"_{str(args.max_length)}_{args.precision}_text_encoders" diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py index 1093f4b27..3a590b62c 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py @@ -26,11 +26,10 @@ def run_prompt_encoder( ireert.asdevicearray(prompt_encoder_runner.config.device, uncond_input_ids[0]), ireert.asdevicearray(prompt_encoder_runner.config.device, uncond_input_ids[1]), ireert.asdevicearray(prompt_encoder_runner.config.device, uncond_input_ids[2]), - ] - encoded_outputs = prompt_encoder_runner.ctx.modules.compiled_text_encoder["encode_tokens"]( - *prompt_encoder_inputs - ) + encoded_outputs = prompt_encoder_runner.ctx.modules.compiled_text_encoder[ + "encode_tokens" + ](*prompt_encoder_inputs) for i in encoded_outputs: i = i.to_host() del prompt_encoder_inputs @@ -42,13 +41,13 @@ def run_tokenize( prompt, negative_prompt, ): - prompt_tokens_dict = tokenizer.tokenize_with_weights(prompt) neg_prompt_tokens_dict = tokenizer.tokenize_with_weights(negative_prompt) text_input_ids_list = 
list(prompt_tokens_dict.values()) uncond_input_ids_list = list(neg_prompt_tokens_dict.values()) return text_input_ids_list, uncond_input_ids_list + if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py new file mode 100644 index 000000000..9789be7cd --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py @@ -0,0 +1,196 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import copy +import os +import sys + +from iree import runtime as ireert +from iree.compiler.ir import Context +import numpy as np +from shark_turbine.aot import * +from shark_turbine.dynamo.passes import ( + DEFAULT_DECOMPOSITIONS, +) +from turbine_models.custom_models.sd_inference import utils +import torch +import torch._dynamo as dynamo +from diffusers import AutoencoderKL + + +class VaeModel(torch.nn.Module): + def __init__( + self, + hf_model_name, + ): + super().__init__() + self.vae = AutoencoderKL.from_pretrained( + hf_model_name, + subfolder="vae", + ) + + def decode(self, inp): + image = self.vae.decode(inp, return_dict=False)[0] + image = image.float() + image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] + return image + + def encode(self, inp): + image_np = inp / 255.0 + image_np = np.moveaxis(image_np, 2, 0) + batch_images = np.expand_dims(image_np, axis=0).repeat(1, axis=0) + image_torch = torch.from_numpy(batch_images) + image_torch = 2.0 * image_torch - 1.0 + image_torch = image_torch + latent = self.vae.encode(image_torch) + return latent + + +def export_vae_model( + vae_model, + hf_model_name, + batch_size, + height, + width, + precision, + compile_to="torch", + external_weights=None, + external_weight_path=None, + device=None, + target_triple=None, + ireec_flags=None, + decomp_attn=False, + exit_on_vmfb=False, + pipeline_dir=None, + attn_spec=None, + input_mlir=None, + weights_only=False, +): + dtype = torch.float16 if precision == "fp16" else torch.float32 + if pipeline_dir: + safe_name = os.path.join(pipeline_dir, "vae") + else: + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{height}x{width}_{precision}_vae_{device}", + ) + if input_mlir: + vmfb_path = utils.compile_to_vmfb( + input_mlir, + device, + target_triple, + ireec_flags, + safe_name, + mlir_source="file", + return_path=not exit_on_vmfb, + attn_spec=attn_spec, + ) + return vmfb_path + + if dtype == torch.float16: + vae_model = vae_model.half() + mapper = {} + utils.save_external_weights( + mapper, vae_model, external_weights, external_weight_path + ) + if weights_only: + return external_weight_path + + input_image_shape = (height, width, 3) + input_latents_shape = (batch_size, 16, height // 8, width // 8) + encode_args = [ + torch.empty( + input_image_shape, + dtype=torch.float32, + ) + ] + decode_args = [ + torch.empty( + input_latents_shape, + dtype=dtype, + ) + ] + decomp_list = [] + if decomp_attn == True: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(vae_model) 
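+        # Only the decode path is exported below: _decode takes latents shaped
+        # (batch_size, 16, height // 8, width // 8) per input_latents_shape and is exposed
+        # as CompiledVae.decode; the encode program is left commented out for now.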
+ + # @fxb.export_program(args=(encode_args,)) + # def _encode(module, inputs,): + # return module.encode(*inputs) + + @fxb.export_program(args=(decode_args,)) + def _decode(module, inputs): + return module.decode(*inputs) + + class CompiledVae(CompiledModule): + decode = _decode + + if external_weights: + externalize_module_parameters(vae_model) + + inst = CompiledVae(context=Context(), import_to="IMPORT") + + module_str = str(CompiledModule.get_mlir_module(inst)) + + if compile_to != "vmfb": + return module_str + else: + vmfb_path = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=not exit_on_vmfb, + attn_spec=attn_spec, + ) + return vmfb_path + + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + if args.input_mlir: + vae_model = None + else: + vae_model = VaeModel( + args.hf_model_name, + ) + mod_str = export_vae_model( + vae_model, + args.hf_model_name, + args.batch_size, + height=args.height, + width=args.width, + precision=args.precision, + compile_to=args.compile_to, + external_weights=args.external_weights, + external_weight_path=args.external_weight_path, + device=args.device, + target_triple=args.iree_target_triple, + ireec_flags=args.ireec_flags + args.attn_flags + args.vae_flags, + decomp_attn=args.decomp_attn, + attn_spec=args.attn_spec, + input_mlir=args.input_mlir, + ) + if args.input_mlir or (args.compile_to == "vmfb"): + exit() + safe_name = utils.create_safe_name( + args.hf_model_name, + f"_bs{str(args.batch_size)}_{args.height}x{args.width}_{args.precision}_vae", + ) + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py new file mode 100644 index 000000000..23db4ab73 --- /dev/null +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py @@ -0,0 +1,77 @@ +import argparse +from turbine_models.model_runner import vmfbRunner +from iree import runtime as ireert +import torch + +torch.random.manual_seed(0) + + +def run_vae( + device, + example_input, + vmfb_path, + hf_model_name, + external_weight_path, +): + runner = vmfbRunner(device, vmfb_path, external_weight_path) + inputs = [ireert.asdevicearray(runner.config.device, example_input)] + results = runner.ctx.modules.compiled_vae["decode"](*inputs) + + return results + + +def run_torch_vae(hf_model_name, variant, example_input): + from turbine_models.custom_models.sd3_inference.sd3_vae import VaeModel + + vae_model = VaeModel( + hf_model_name, + ) + + if variant == "decode": + results = vae_model.decode(example_input) + elif variant == "encode": + results = vae_model.encode(example_input) + np_torch_output = results.detach().cpu().numpy() + return np_torch_output + + +if __name__ == "__main__": + from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + + dtype = torch.float16 if args.precision == "fp16" else torch.float32 + if args.vae_variant == "decode": + example_input = torch.rand( + args.batch_size, 16, args.height // 8, args.width // 8, dtype=dtype + ) + elif args.vae_variant == "encode": + example_input = torch.rand( + args.batch_size, 3, args.height, args.width, dtype=dtype + ) + print("generating turbine output:") + turbine_results = run_vae( + args.device, + example_input, + args.vmfb_path, + args.hf_model_name, + args.external_weight_path, + ) + print( + "TURBINE 
OUTPUT:", + turbine_results.to_host(), + turbine_results.to_host().shape, + turbine_results.to_host().dtype, + ) + if args.compare_vs_torch: + print("generating torch output: ") + from turbine_models.custom_models.sd_inference import utils + + torch_output = run_torch_vae( + args.hf_model_name, args.vae_variant, example_input.float() + ) + print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) + err = utils.largest_error(torch_output, turbine_results) + print("Largest Error: ", err) + assert err < 2e-3 + + # TODO: Figure out why we occasionally segfault without unlinking output variables + turbine_results = None diff --git a/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py index 09a69fe9c..29b9d2f80 100644 --- a/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py +++ b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py @@ -3,7 +3,7 @@ import torch, math from torch import nn from transformers import CLIPTokenizer, T5TokenizerFast -from shark_turbine import ops +from shark_turbine import ops ################################################################################################# ### Core/Utility @@ -13,41 +13,58 @@ def attention(q, k, v, heads, mask=None): """Convenience wrapper around a basic attention operation""" b, _, dim_head = q.shape - #ops.iree.trace_tensor("attention_q", q[0,0,:5]) - #ops.iree.trace_tensor("attention_k", k[0,0,:5]) - #ops.iree.trace_tensor("attention_v", v[0,0,:5]) + # ops.iree.trace_tensor("attention_q", q[0,0,:5]) + # ops.iree.trace_tensor("attention_k", k[0,0,:5]) + # ops.iree.trace_tensor("attention_v", v[0,0,:5]) dim_head //= heads q, k, v = map(lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2), (q, k, v)) - out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) - #ops.iree.trace_tensor("attention_out", out[0,0,:5]) + out = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False + ) + # ops.iree.trace_tensor("attention_out", out[0,0,:5]) return out.transpose(1, 2).reshape(b, -1, heads * dim_head) class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks""" - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, dtype=None, device=None): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + bias=True, + dtype=None, + device=None, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, dtype=dtype, device=device) + self.fc1 = nn.Linear( + in_features, hidden_features, bias=bias, dtype=dtype, device=device + ) self.act = act_layer - self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, dtype=dtype, device=device) + self.fc2 = nn.Linear( + hidden_features, out_features, bias=bias, dtype=dtype, device=device + ) def forward(self, x): x = self.fc1(x) - #ops.iree.trace_tensor("mlpfx", x[0,0,:5]) + # ops.iree.trace_tensor("mlpfx", x[0,0,:5]) x = self.act(x) - #ops.iree.trace_tensor("mlpact", x[0,0,:5]) + # ops.iree.trace_tensor("mlpact", x[0,0,:5]) x = self.fc2(x) - #ops.iree.trace_tensor("mlpanotherfc", x[0,0,:5]) + # 
ops.iree.trace_tensor("mlpanotherfc", x[0,0,:5]) return x + def load_into(f, model, prefix, device, dtype=None): """Just a debugging-friendly hack to apply the weights in a safetensors file to the pytorch module.""" for key in f.keys(): if key.startswith(prefix) and not key.startswith("loss."): - path = key[len(prefix):].split(".") + path = key[len(prefix) :].split(".") obj = model for p in path: if obj is list: @@ -55,7 +72,9 @@ def load_into(f, model, prefix, device, dtype=None): else: obj = getattr(obj, p, None) if obj is None: - print(f"Skipping key '{key}' in safetensors file as '{p}' does not exist in python model") + print( + f"Skipping key '{key}' in safetensors file as '{p}' does not exist in python model" + ) break if obj is None: continue @@ -69,6 +88,7 @@ def load_into(f, model, prefix, device, dtype=None): print(f"Failed to load key '{key}' in safetensors file: {e}") raise e + ################################################################################################# ### CLIP ################################################################################################# @@ -78,10 +98,18 @@ class CLIPAttention(torch.nn.Module): def __init__(self, embed_dim, heads, dtype, device): super().__init__() self.heads = heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.q_proj = nn.Linear( + embed_dim, embed_dim, bias=True, dtype=dtype, device=device + ) + self.k_proj = nn.Linear( + embed_dim, embed_dim, bias=True, dtype=dtype, device=device + ) + self.v_proj = nn.Linear( + embed_dim, embed_dim, bias=True, dtype=dtype, device=device + ) + self.out_proj = nn.Linear( + embed_dim, embed_dim, bias=True, dtype=dtype, device=device + ) def forward(self, x, mask=None): q = self.q_proj(x) @@ -96,14 +124,30 @@ def forward(self, x, mask=None): "gelu": torch.nn.functional.gelu, } + class CLIPLayer(torch.nn.Module): - def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): + def __init__( + self, + embed_dim, + heads, + intermediate_size, + intermediate_activation, + dtype, + device, + ): super().__init__() self.layer_norm1 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) self.self_attn = CLIPAttention(embed_dim, heads, dtype, device) self.layer_norm2 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) - #self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device) - self.mlp = Mlp(embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation], dtype=dtype, device=device) + # self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device) + self.mlp = Mlp( + embed_dim, + intermediate_size, + embed_dim, + act_layer=ACTIVATIONS[intermediate_activation], + dtype=dtype, + device=device, + ) def forward(self, x, mask=None): x += self.self_attn(self.layer_norm1(x), mask) @@ -112,9 +156,30 @@ def forward(self, x, mask=None): class CLIPEncoder(torch.nn.Module): - def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): + def __init__( + self, + num_layers, + embed_dim, + heads, + intermediate_size, + intermediate_activation, + dtype, + device, + ): super().__init__() - self.layers = 
torch.nn.ModuleList([CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) for i in range(num_layers)]) + self.layers = torch.nn.ModuleList( + [ + CLIPLayer( + embed_dim, + heads, + intermediate_size, + intermediate_activation, + dtype, + device, + ) + for i in range(num_layers) + ] + ) def forward(self, x, mask=None, intermediate_output=None): if intermediate_output is not None: @@ -129,10 +194,16 @@ def forward(self, x, mask=None, intermediate_output=None): class CLIPEmbeddings(torch.nn.Module): - def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None): + def __init__( + self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None + ): super().__init__() - self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device) - self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device) + self.token_embedding = torch.nn.Embedding( + vocab_size, embed_dim, dtype=dtype, device=device + ) + self.position_embedding = torch.nn.Embedding( + num_positions, embed_dim, dtype=dtype, device=device + ) def forward(self, input_tokens): return self.token_embedding(input_tokens) + self.position_embedding.weight @@ -147,17 +218,36 @@ def __init__(self, config_dict, dtype, device): intermediate_activation = config_dict["hidden_act"] super().__init__() self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device) - self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) + self.encoder = CLIPEncoder( + num_layers, + embed_dim, + heads, + intermediate_size, + intermediate_activation, + dtype, + device, + ) self.final_layer_norm = nn.LayerNorm(embed_dim, dtype=dtype, device=device) - def forward(self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True): + def forward( + self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True + ): x = self.embeddings(input_tokens) - causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1) - x, i = self.encoder(x, mask=causal_mask, intermediate_output=intermediate_output) + causal_mask = ( + torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device) + .fill_(float("-inf")) + .triu_(1) + ) + x, i = self.encoder( + x, mask=causal_mask, intermediate_output=intermediate_output + ) x = self.final_layer_norm(x) if i is not None and final_layer_norm_intermediate: i = self.final_layer_norm(i) - pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),] + pooled_output = x[ + torch.arange(x.shape[0], device=x.device), + input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1), + ] return x, i, pooled_output @@ -167,7 +257,9 @@ def __init__(self, config_dict, dtype, device): self.num_layers = config_dict["num_hidden_layers"] self.text_model = CLIPTextModel_(config_dict, dtype, device) embed_dim = config_dict["hidden_size"] - self.text_projection = nn.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) + self.text_projection = nn.Linear( + embed_dim, embed_dim, bias=False, dtype=dtype, device=device + ) self.text_projection.weight.copy_(torch.eye(embed_dim)) self.dtype = dtype @@ -182,12 +274,21 @@ def forward(self, *args, **kwargs): out = self.text_projection(x[2]) return (x[0], x[1], out, x[2]) + class SDTokenizer: - def __init__(self, max_length=77, 
pad_with_end=True, tokenizer=None, has_start_token=True, pad_to_max_length=True, min_length=None): + def __init__( + self, + max_length=77, + pad_with_end=True, + tokenizer=None, + has_start_token=True, + pad_to_max_length=True, + min_length=None, + ): self.tokenizer = tokenizer self.max_length = max_length self.min_length = min_length - empty = self.tokenizer('')["input_ids"] + empty = self.tokenizer("")["input_ids"] if has_start_token: self.tokens_start = 1 self.start_token = empty[0] @@ -202,8 +303,7 @@ def __init__(self, max_length=77, pad_with_end=True, tokenizer=None, has_start_t self.inv_vocab = {v: k for k, v in vocab.items()} self.max_word_length = 8 - - def tokenize_with_weights(self, text:str): + def tokenize_with_weights(self, text: str): """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3.""" if self.pad_with_end: pad_token = self.end_token @@ -212,10 +312,15 @@ def tokenize_with_weights(self, text:str): batch = [] if self.start_token is not None: batch.append((self.start_token, 1.0)) - to_tokenize = text.replace("\n", " ").split(' ') + to_tokenize = text.replace("\n", " ").split(" ") to_tokenize = [x for x in to_tokenize if x != ""] for word in to_tokenize: - batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start:-1]]) + batch.extend( + [ + (t, 1) + for t in self.tokenizer(word)["input_ids"][self.tokens_start : -1] + ] + ) batch.append((self.end_token, 1.0)) if self.pad_to_max_length: batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch))) @@ -236,7 +341,7 @@ def __init__(self): self.clip_g = SDXLClipGTokenizer(clip_tokenizer) self.t5xxl = T5XXLTokenizer() - def tokenize_with_weights(self, text:str): + def tokenize_with_weights(self, text: str): out = {} out["g"] = self.clip_g.tokenize_with_weights(text) out["l"] = self.clip_l.tokenize_with_weights(text) @@ -248,8 +353,8 @@ def tokenize_with_weights(self, text:str): class ClipTokenWeightEncoder: def encode_token_weights(self, token_weight_pairs): - #tokens = list(map(lambda a: a[0], token_weight_pairs[0])) - tokens = token_weight_pairs[:,:,0] + # tokens = list(map(lambda a: a[0], token_weight_pairs[0])) + tokens = token_weight_pairs[:, :, 0] out, pooled = self(tokens) if pooled is not None: first_pooled = pooled[0:1].cpu() @@ -261,9 +366,22 @@ def encode_token_weights(self, token_weight_pairs): class SDClipModel(torch.nn.Module): """Uses the CLIP transformer encoder for text (from huggingface)""" + LAYERS = ["last", "pooled", "hidden"] - def __init__(self, device="cpu", max_length=77, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=CLIPTextModel, - special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, return_projected_pooled=True): + + def __init__( + self, + device="cpu", + max_length=77, + layer="last", + layer_idx=None, + textmodel_json_config=None, + dtype=None, + model_class=CLIPTextModel, + special_tokens={"start": 49406, "end": 49407, "pad": 49407}, + layer_norm_hidden_state=True, + return_projected_pooled=True, + ): super().__init__() assert layer in self.LAYERS self.transformer = model_class(textmodel_json_config, dtype, device) @@ -282,14 +400,20 @@ def __init__(self, device="cpu", max_length=77, layer="last", layer_idx=None, te assert layer_idx is not None assert abs(layer_idx) < self.num_layers self.set_clip_options({"layer": layer_idx}) - self.options_default = 
(self.layer, self.layer_idx, self.return_projected_pooled) + self.options_default = ( + self.layer, + self.layer_idx, + self.return_projected_pooled, + ) def encode_token_weights(self, token_weight_pairs): pass def set_clip_options(self, options): layer_idx = options.get("layer", self.layer_idx) - self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) + self.return_projected_pooled = options.get( + "projected_pooled", self.return_projected_pooled + ) if layer_idx is None or abs(layer_idx) > self.num_layers: self.layer = "last" else: @@ -297,20 +421,28 @@ def set_clip_options(self, options): self.layer_idx = layer_idx def forward(self, token_weight_pairs): - #tokens = list(map(lambda a: a[0], token_weight_pairs[0])) - tokens = token_weight_pairs[:,:,0] - #backup_embeds = self.transformer.get_input_embeddings() - #device = backup_embeds.weight.device - #tokens = torch.LongTensor(tokens).to(device) - outputs = self.transformer(tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state) - #self.transformer.set_input_embeddings(backup_embeds) + # tokens = list(map(lambda a: a[0], token_weight_pairs[0])) + tokens = token_weight_pairs[:, :, 0] + # backup_embeds = self.transformer.get_input_embeddings() + # device = backup_embeds.weight.device + # tokens = torch.LongTensor(tokens).to(device) + outputs = self.transformer( + tokens, + intermediate_output=self.layer_idx, + final_layer_norm_intermediate=self.layer_norm_hidden_state, + ) + # self.transformer.set_input_embeddings(backup_embeds) if self.layer == "last": z = outputs[0] else: z = outputs[1] pooled_output = None if len(outputs) >= 3: - if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None: + if ( + not self.return_projected_pooled + and len(outputs) >= 4 + and outputs[3] is not None + ): pooled_output = outputs[3].float() elif outputs[2] is not None: pooled_output = outputs[2].float() @@ -325,17 +457,37 @@ def forward(self, token_weight_pairs): class SDXLClipG(SDClipModel): """Wraps the CLIP-G model into the SD-CLIP-Model interface""" - def __init__(self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None): + + def __init__( + self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None + ): if layer == "penultimate": - layer="hidden" - layer_idx=-2 - super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False) + layer = "hidden" + layer_idx = -2 + super().__init__( + device=device, + layer=layer, + layer_idx=layer_idx, + textmodel_json_config=config, + dtype=dtype, + special_tokens={"start": 49406, "end": 49407, "pad": 0}, + layer_norm_hidden_state=False, + ) class T5XXLModel(SDClipModel): """Wraps the T5-XXL model into the SD-CLIP-Model interface for convenience""" + def __init__(self, config, device="cpu", layer="last", layer_idx=None, dtype=None): - super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5) + super().__init__( + device=device, + layer=layer, + layer_idx=layer_idx, + textmodel_json_config=config, + dtype=dtype, + special_tokens={"end": 1, "pad": 0}, + model_class=T5, + ) ################################################################################################# @@ -345,14 +497,24 @@ def __init__(self, config, device="cpu", layer="last", 
layer_idx=None, dtype=Non class T5XXLTokenizer(SDTokenizer): """Wraps the T5 Tokenizer from HF into the SDTokenizer interface""" + def __init__(self): - super().__init__(pad_with_end=False, tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77) + super().__init__( + pad_with_end=False, + tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), + has_start_token=False, + pad_to_max_length=False, + max_length=99999999, + min_length=77, + ) class T5LayerNorm(torch.nn.Module): def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None): super().__init__() - self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device)) + self.weight = torch.nn.Parameter( + torch.ones(hidden_size, dtype=dtype, device=device) + ) self.variance_epsilon = eps def forward(self, x): @@ -390,7 +552,9 @@ def forward(self, x): class T5Attention(torch.nn.Module): - def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device): + def __init__( + self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device + ): super().__init__() # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) @@ -402,10 +566,14 @@ def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dty if relative_attention_bias: self.relative_attention_num_buckets = 32 self.relative_attention_max_distance = 128 - self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device) + self.relative_attention_bias = torch.nn.Embedding( + self.relative_attention_num_buckets, self.num_heads, device=device + ) @staticmethod - def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + def _relative_position_bucket( + relative_position, bidirectional=True, num_buckets=32, max_distance=128 + ): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 @@ -432,7 +600,9 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets relative_buckets += (relative_position > 0).to(torch.long) * num_buckets relative_position = torch.abs(relative_position) else: - relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + relative_position = -torch.min( + relative_position, torch.zeros_like(relative_position) + ) # now relative_position is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 @@ -443,23 +613,38 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets / math.log(max_distance / max_exact) * (num_buckets - max_exact) ).to(torch.long) - relative_position_if_large = torch.min(relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)) - relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + relative_position_if_large = torch.min( + relative_position_if_large, + torch.full_like(relative_position_if_large, num_buckets - 1), + ) + relative_buckets += torch.where( + is_small, relative_position, relative_position_if_large + ) return relative_buckets def compute_bias(self, query_length, key_length, device): """Compute binned relative position bias""" - context_position = 
torch.arange(query_length, dtype=torch.long, device=device)[:, None] - memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] - relative_position = memory_position - context_position # shape (query_length, key_length) + context_position = torch.arange(query_length, dtype=torch.long, device=device)[ + :, None + ] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[ + None, : + ] + relative_position = ( + memory_position - context_position + ) # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( relative_position, # shape (query_length, key_length) bidirectional=True, num_buckets=self.relative_attention_num_buckets, max_distance=self.relative_attention_max_distance, ) - values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + values = self.relative_attention_bias( + relative_position_bucket + ) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze( + 0 + ) # shape (1, num_heads, query_length, key_length) return values def forward(self, x, past_bias=None): @@ -470,14 +655,27 @@ def forward(self, x, past_bias=None): past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device) if past_bias is not None: mask = past_bias - out = attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask) + out = attention( + q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask + ) return self.o(out), past_bias class T5LayerSelfAttention(torch.nn.Module): - def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): + def __init__( + self, + model_dim, + inner_dim, + ff_dim, + num_heads, + relative_attention_bias, + dtype, + device, + ): super().__init__() - self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device) + self.SelfAttention = T5Attention( + model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device + ) self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) def forward(self, x, past_bias=None): @@ -487,10 +685,29 @@ def forward(self, x, past_bias=None): class T5Block(torch.nn.Module): - def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): + def __init__( + self, + model_dim, + inner_dim, + ff_dim, + num_heads, + relative_attention_bias, + dtype, + device, + ): super().__init__() self.layer = torch.nn.ModuleList() - self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device)) + self.layer.append( + T5LayerSelfAttention( + model_dim, + inner_dim, + ff_dim, + num_heads, + relative_attention_bias, + dtype, + device, + ) + ) self.layer.append(T5LayerFF(model_dim, ff_dim, dtype, device)) def forward(self, x, past_bias=None): @@ -500,13 +717,38 @@ def forward(self, x, past_bias=None): class T5Stack(torch.nn.Module): - def __init__(self, num_layers, model_dim, inner_dim, ff_dim, num_heads, vocab_size, dtype, device): + def __init__( + self, + num_layers, + model_dim, + inner_dim, + ff_dim, + num_heads, + vocab_size, + dtype, + device, + ): super().__init__() self.embed_tokens = torch.nn.Embedding(vocab_size, model_dim, device=device) - self.block = torch.nn.ModuleList([T5Block(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias=(i == 0), dtype=dtype, 
device=device) for i in range(num_layers)]) + self.block = torch.nn.ModuleList( + [ + T5Block( + model_dim, + inner_dim, + ff_dim, + num_heads, + relative_attention_bias=(i == 0), + dtype=dtype, + device=device, + ) + for i in range(num_layers) + ] + ) self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) - def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True): + def forward( + self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True + ): intermediate = None x = self.embed_tokens(input_ids) past_bias = None @@ -524,7 +766,16 @@ class T5(torch.nn.Module): def __init__(self, config_dict, dtype, device): super().__init__() self.num_layers = config_dict["num_layers"] - self.encoder = T5Stack(self.num_layers, config_dict["d_model"], config_dict["d_model"], config_dict["d_ff"], config_dict["num_heads"], config_dict["vocab_size"], dtype, device) + self.encoder = T5Stack( + self.num_layers, + config_dict["d_model"], + config_dict["d_model"], + config_dict["d_ff"], + config_dict["num_heads"], + config_dict["vocab_size"], + dtype, + device, + ) self.dtype = dtype def get_input_embeddings(self): From ac5fd3fdafa51cf4b397e6c3c85df43d9c641b00 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 14 Jun 2024 15:01:10 -0500 Subject: [PATCH 120/174] Update mmdit runner inputs, small attn reproducer, pad attention flag --- .../sd3_inference/sd3_cmd_opts.py | 7 ++ .../custom_models/sd3_inference/sd3_mmdit.py | 96 +++++++++++++++++++ .../sd3_inference/sd3_mmdit_runner.py | 9 +- .../custom_models/sd_inference/utils.py | 2 +- 4 files changed, 109 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index e072fad2c..2697a1e00 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -284,6 +284,13 @@ def is_valid_file(arg): default="SD3_output.png", help="Path to output file for generated images.", ) +p.add_argument( + "--attn_repro", + default=False, + action="store_true", + help="Just compile attention reproducer for mmdit.", +) + ############################################################################## # IREE Compiler Options diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 85414a1e1..07a2f1e2a 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -52,7 +52,87 @@ def forward( return_dict=False, )[0] return noise_pred + +class MMDiTAttention(torch.nn.Module): + def __init__( + self, + ): + super().__init__() + + def forward(self, q, k, v): + return torch.nn.functional.scaled_dot_product_attention( + q, k, v, dropout_p=0.0, is_causal=False + ) + + +@torch.no_grad() +def export_attn( + precision="fp16", + device="cpu", + target_triple="x86_64-unknown-linux-gnu", + ireec_flags="", + compile_to="torch", + decomp_attn=False, + attn_spec=None, +): + dtype = torch.float16 if precision == "fp16" else torch.float32 + qkv_shape = (2, 24, 4250, 64) + attn_module = MMDiTAttention() + safe_name = "attn_repro_" + precision + "_" + target_triple + if decomp_attn == True: + safe_name += "_decomp" + + if dtype == torch.float16: + attn_module = attn_module.half() + + example_qkv = [ + torch.empty(qkv_shape, dtype=dtype), + 
torch.empty(qkv_shape, dtype=dtype), + torch.empty(qkv_shape, dtype=dtype), + ] + + decomp_list = [] + if decomp_attn == True: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(attn_module) + @fxb.export_program( + args=(example_qkv,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) + + class CompiledAttn(CompiledModule): + run_forward = _forward + + inst = CompiledAttn(context=Context(), import_to="IMPORT") + + module_str = str(CompiledModule.get_mlir_module(inst)) + + if compile_to != "vmfb": + return module_str + else: + vmfb_path = utils.compile_to_vmfb( + module_str, + device, + target_triple, + ireec_flags, + safe_name, + return_path=True, + attn_spec=attn_spec, + ) + return vmfb_path @torch.no_grad() def export_mmdit_model( @@ -183,6 +263,22 @@ class CompiledMmdit(CompiledModule): logging.basicConfig(level=logging.DEBUG) from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + if args.attn_repro: + mod_str = export_attn( + args.precision, + args.device, + args.iree_target_triple, + args.ireec_flags, + args.compile_to, + args.decomp_attn, + attn_spec=args.attn_spec, + ) + if args.compile_to != "vmfb": + safe_name = "attn_repro_" + args.precision + with open(f"{safe_name}.mlir", "w+") as f: + f.write(mod_str) + print("Saved to", safe_name + ".mlir") + exit() if args.input_mlir: mmdit_model = None else: diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py index d730e140e..599132480 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -69,14 +69,15 @@ def run_diffusers_mmdit( dtype = torch.float16 else: dtype = torch.float32 - + + batch_size = args.batch_size * 2 #do classifier free guidance hidden_states = torch.randn( - (args.batch_size, 16, args.height // 8, args.width // 8), dtype=dtype + (batch_size, 16, args.height // 8, args.width // 8), dtype=dtype ) encoder_hidden_states = torch.randn( - (args.batch_size, args.max_length * 2, 4096), dtype=dtype + (batch_size, args.max_length * 2, 4096), dtype=dtype ) - pooled_projections = torch.randn((args.batch_size, 2048), dtype=dtype) + pooled_projections = torch.randn((batch_size, 2048), dtype=dtype) timestep = torch.tensor([0], dtype=dtype) turbine_output = run_mmdit_turbine( diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index ce48dff33..52c980903 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -57,7 +57,7 @@ "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics, 
iree-linalg-ext-pad-attention{pad-to-multiple-of=0,64,0,32,0}))", ], "unet": [""], "clip": [""], From cd50bbeea01e83ffd2459804938e2eb65c860032 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sun, 16 Jun 2024 22:23:23 -0500 Subject: [PATCH 121/174] SD3 small tweaks for numerics --- .../sd3_inference/sd3_pipeline.py | 59 ++++++++++++++++--- .../sd3_inference/sd3_schedulers.py | 4 +- .../sd3_inference/sd3_text_encoders.py | 2 +- 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index cd629f5ad..4aed8a962 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -445,16 +445,18 @@ def generate_images( numpy_images = [] for i in range(batch_count): - generator = torch.random.manual_seed(seed + i) + generator = torch.Generator().manual_seed(int(seed)) + shape = ( + self.batch_size, + 16, + self.height // 8, + self.width // 8, + ) rand_sample = torch.randn( - ( - self.batch_size, - 16, - self.height // 8, - self.width // 8, - ), + shape, generator=generator, - dtype=torch_dtype, + dtype=torch.float32, + layout=torch.strided, ) samples.append( ireert.asdevicearray( @@ -499,7 +501,6 @@ def generate_images( prompt_embeds, pooled_prompt_embeds = self.runners[ "text_encoders" ].ctx.modules.compiled_text_encoder["encode_tokens"](*text_encoders_inputs) - encode_prompts_end = time.time() for i in range(batch_count): @@ -617,11 +618,51 @@ def generate_images( image.save(img_path) print(img_path, "saved") return + +def run_diffusers_cpu( + hf_model_name, + prompt, + negative_prompt, + guidance_scale, + seed, + height, + width, + num_inference_steps, +): + from diffusers import StableDiffusion3Pipeline + + pipe = StableDiffusion3Pipeline.from_pretrained(hf_model_name, torch_dtype=torch.float32) + pipe = pipe.to("cpu") + generator = torch.Generator().manual_seed(int(seed)) + + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + height=height, + width=width, + generator=generator, + ).images[0] + timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S") + image.save(f"diffusers_reference_output_{timestamp}.png") if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + if args.compare_vs_torch: + run_diffusers_cpu( + args.hf_model_name, + args.prompt, + args.negative_prompt, + args.guidance_scale, + args.seed, + args.height, + args.width, + args.num_inference_steps, + ) + exit() map = empty_pipe_dict mlirs = copy.deepcopy(map) vmfbs = copy.deepcopy(map) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 0d4078605..607689b3a 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -66,9 +66,9 @@ def __init__( def initialize(self, sample): step_count = torch.tensor(len(self.timesteps)) timesteps = self.model.timesteps - # ops.trace_tensor("timesteps", self.timesteps) + ops.trace_tensor("sample", sample[:,:,0,0]) return ( - sample.type(self.dtype), + sample, step_count, timesteps.type(torch.float32), ) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py 
b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index 89bee3cb1..97d41caf4 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -105,7 +105,7 @@ def forward(self, tokens_g, tokens_l, tokens_t5xxl, neg_g, neg_l, neg_t5): neg_cond, neg_cond_pool = self.get_cond(neg_l, neg_g, neg_t5) prompt_embeds = torch.cat([neg_cond, conditioning], dim=0) - pooled_prompt_embeds = torch.cat([cond_pool, neg_cond_pool], dim=0) + pooled_prompt_embeds = torch.cat([neg_cond_pool, cond_pool], dim=0) return prompt_embeds, pooled_prompt_embeds From 81ee0936cf1b18f79fcf66a492d30307a7e7bfd4 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 01:29:41 -0500 Subject: [PATCH 122/174] Attn debugging tools --- .../sd3_inference/sd3_mmdit_runner.py | 66 +++++++++++++++++++ .../sd3_inference/sd3_schedulers.py | 5 +- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py index 599132480..50a5fb285 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -57,6 +57,49 @@ def run_diffusers_mmdit( return noise_pred.numpy() +def run_attn_turbine(q, k, v, args): + attn_runner = vmfbRunner( + args.device, + args.vmfb_path, + None, + ) + iree_inputs = [ + ireert.asdevicearray(attn_runner.config.device, q), + ireert.asdevicearray(attn_runner.config.device, k), + ireert.asdevicearray(attn_runner.config.device, v), + ] + attn_output = attn_runner.ctx.modules.compiled_attn["run_forward"]( + *iree_inputs + ).to_host() + return attn_output + +@torch.no_grad() +def run_attn_torch(q, k, v, args): + from turbine_models.custom_models.sd3_inference.sd3_mmdit import MMDiTAttention + + mmdit_attn = MMDiTAttention() + attn_output = mmdit_attn.forward( + torch.tensor(q, dtype=torch.float32), + torch.tensor(k, dtype=torch.float32), + torch.tensor(v, dtype=torch.float32), + ) + + return attn_output.numpy() + +def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): + if not np.allclose(turbine_output, torch_output, rtol=4e-2, atol=4e-2): + if turbine_output.ndim > 0: + orig_dim = dim + for idx, i in enumerate(torch_output): + dim = [*orig_dim, idx] + try: + np.testing.assert_allclose(turbine_output[idx], torch_output[idx], rtol=4e-2, atol=4e-2) + except Exception as e: + err = np.abs(turbine_output[idx] - torch_output[idx]) + failed_dims.append(dim) + errs.append([err, turbine_output[idx], torch_output[idx]]) + failed_dims, errs = find_errs(turbine_output[idx], torch_output[idx], dim, failed_dims, errs) + return (failed_dims, errs) if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args @@ -69,6 +112,29 @@ def run_diffusers_mmdit( dtype = torch.float16 else: dtype = torch.float32 + + if args.attn_repro: + qkv_shape = (2, 24, 4250, 64) + example_qkv = [ + np.load("q.npy").astype(np.float16), + np.load("k.npy").astype(np.float16), + np.load("v.npy").astype(np.float16), + ] + turbine_output = run_attn_turbine( + *example_qkv, + args, + ) + torch_output = run_attn_torch(*example_qkv, args).astype(np.float16) + np.save("turbine_attn_output.npy", turbine_output) + np.save("torch_attn_output.npy", torch_output) + failed_dims, errs = find_errs(turbine_output, torch_output) + for idx, dim in 
enumerate(failed_dims): + if len(dim) == len(torch_output.shape): + print("Failed dimension: ", dim, " with error: ", errs[idx][0]) + print("Turbine output: ", errs[idx][1]) + print("Torch output: ", errs[idx][2]) + print(torch_output.shape) + exit() batch_size = args.batch_size * 2 #do classifier free guidance hidden_states = torch.randn( diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 607689b3a..06c23ef1c 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -93,8 +93,9 @@ def step(self, noise_pred, t, sample, guidance_scale, i): sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) - -class SharkSchedulerCPUWrapper: +# Wraps a diffusers scheduler running on native pytorch+cpu. +# This allows us to use it interchangeably with compiled schedulers in our pipeline(s). +class TorchCPUFlowSchedulerCompat: @torch.no_grad() def __init__( self, scheduler, batch_size, num_inference_steps, dest_device, latents_dtype From b793686c64039e0b74598c64a9d28e41e49dc170 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 07:48:31 -0500 Subject: [PATCH 123/174] Attn debugging, piping for multi-device in sd3 --- .../sd3_inference/sd3_mmdit_runner.py | 8 +-- .../sd3_inference/sd3_pipeline.py | 49 ++++++++++++++----- .../custom_models/sd_inference/utils.py | 11 ++++- .../sdxl_inference/sdxl_compiled_pipeline.py | 3 +- 4 files changed, 52 insertions(+), 19 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py index 50a5fb285..f547b59bb 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -110,15 +110,17 @@ def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): if args.precision == "fp16": dtype = torch.float16 + np_dtype = np.float16 else: dtype = torch.float32 + np_dtype = np.float32 if args.attn_repro: qkv_shape = (2, 24, 4250, 64) example_qkv = [ - np.load("q.npy").astype(np.float16), - np.load("k.npy").astype(np.float16), - np.load("v.npy").astype(np.float16), + np.load("q.npy").astype(np_dtype), + np.load("k.npy").astype(np_dtype), + np.load("v.npy").astype(np_dtype), ] turbine_output = run_attn_turbine( *example_qkv, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 4aed8a962..c16c22c3c 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -68,8 +68,8 @@ def __init__( max_length: int, batch_size: int, num_inference_steps: int, - device: str, - iree_target_triple: str, + device: str | dict[str], + iree_target_triple: str | dict[str], ireec_flags: dict = EMPTY_FLAGS, attn_spec: str = None, decomp_attn: bool = False, @@ -89,7 +89,25 @@ def __init__( self.max_length = max_length self.batch_size = batch_size self.num_inference_steps = num_inference_steps - self.device = device + self.devices = {} + if isinstance(self.device, dict): + assert isinstance(iree_target_triple, dict), "Device and target triple must be both dicts or both strings." 
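+            # e.g. device={"clip": "cpu", "mmdit": "rocm", "vae": "rocm"} paired with
+            # iree_target_triple={"clip": "x86_64-unknown-linux-gnu", "mmdit": "gfx942", "vae": "gfx942"}
+            # (illustrative values only).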
+ self.devices["clip"] = { + "device": device["clip"], + "target": iree_target_triple["clip"] + } + self.devices["mmdit"] = { + "device": device["mmdit"], + "target": iree_target_triple["mmdit"] + } + self.devices["vae"] = { + "device": device["vae"], + "target": iree_target_triple["vae"] + } + else: + self.devices["clip"] = device + self.devices["mmdit"] = device + self.devices["vae"] = device self.iree_target_triple = iree_target_triple self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS self.attn_spec = attn_spec @@ -291,8 +309,8 @@ def export_submodel( "vmfb", self.external_weights, mmdit_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["mmdit"]["device"], + self.devices["mmdit"]["target"], self.ireec_flags["mmdit"], self.decomp_attn, exit_on_vmfb=False, @@ -313,8 +331,8 @@ def export_submodel( self.num_inference_steps, self.precision, "vmfb", - self.device, - self.iree_target_triple, + self.devices["mmdit"]["device"], + self.devices["mmdit"]["target"], self.ireec_flags["scheduler"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, @@ -336,8 +354,8 @@ def export_submodel( "vmfb", self.external_weights, vae_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["vae"]["device"], + self.devices["vae"]["target"], self.ireec_flags["vae"], self.vae_decomp_attn, exit_on_vmfb=False, @@ -357,8 +375,8 @@ def export_submodel( "vmfb", self.external_weights, text_encoders_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["clip"]["device"], + self.devices["clip"]["target"], self.ireec_flags["clip"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, @@ -374,10 +392,15 @@ def load_pipeline( self, vmfbs: dict, weights: dict, - rt_device: str = "local-task", + rt_device: str | dict[str], compiled_pipeline: bool = False, split_scheduler: bool = True, + extra_device_args: dict = {}, ): + if "npu_delegate_path" in extra_device_args.keys(): + delegate = extra_device_args["npu_delegate_path"] + else: + delegate = None self.runners = {} runners = {} load_start = time.time() @@ -399,7 +422,7 @@ def load_pipeline( runners["vae"] = vmfbRunner( rt_device, vmfbs["vae"], - weights["vae"], + weights["vae"], ) vae_loaded = time.time() print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 52c980903..4489141d6 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -66,7 +66,6 @@ } znver4_flags = { "all": [ - # "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))", "--iree-llvmcpu-target-cpu=znver4", "--iree-opt-const-eval=false", "--iree-llvmcpu-enable-ukernels=mmt4d,pack,unpack", @@ -74,6 +73,12 @@ "--iree-opt-const-expr-max-size-increase-threshold=1000000000000000", "--iree-flow-enable-fuse-padding-into-linalg-consumer-ops", ], + "bf16": [ + "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-global-opt-demote-contraction-inputs-to-bf16))", + ], + "winograd": [ + "--iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true},iree-global-opt-demote-contraction-inputs-to-bf16))" + ], } @@ -182,10 +187,12 @@ def compile_to_vmfb( if attn_spec in ["default", "mfma"]: attn_spec = 
get_mfma_spec_path(target_triple, os.path.dirname(safe_name)) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) - elif attn_spec in ["wmma"] or "gfx11" in target_triple: + elif attn_spec in ["wmma"] or ("gfx11" in target_triple and not attn_spec): attn_spec = get_wmma_spec_path(target_triple, os.path.dirname(safe_name)) if attn_spec: flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) + elif attn_spec and attn_spec != "None": + flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) for i, flag in enumerate(ireec_flags): k = flag.strip().split("=")[0] diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 514c73118..2edc2866c 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -773,6 +773,7 @@ def generate_images( ](samples[i], prompt_embeds, add_text_embeds, guidance_scale) vae_start = time.time() + np.save("latents_winter_cat.npy", latents.to_host().astype(np.float32)) vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( latents ) @@ -780,7 +781,7 @@ def generate_images( pipe_end = time.time() image = vae_out.to_host() - + np.save("image_winter_cat.npy", image.astype(np.float32)) numpy_images.append(image) print("Batch #", i + 1, "\n") print( From 77546095def7216cd46a225157fb7336b9b508f2 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 09:38:26 -0500 Subject: [PATCH 124/174] Fixes for multi-device (SD3) --- .../sd3_inference/sd3_cmd_opts.py | 42 ++++++ .../sd3_inference/sd3_pipeline.py | 123 +++++++++++------- .../custom_models/sd_inference/utils.py | 24 ++++ 3 files changed, 144 insertions(+), 45 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index 2697a1e00..aec606e3e 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -177,6 +177,48 @@ def is_valid_file(arg): help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.", ) +p.add_argument( + "--clip_device", + default=None, + type=str, + help="Device to run CLIP on. If None, defaults to the device specified in args.device.", +) + +p.add_argument( + "--mmdit_device", + default=None, + type=str, + help="Device to run MMDiT on. If None, defaults to the device specified in args.device.", +) + +p.add_argument( + "--vae_device", + default=None, + type=str, + help="Device to run VAE on. If None, defaults to the device specified in args.device.", +) + +p.add_argument( + "--clip_target", + default=None, + type=str, + help="IREE target for CLIP compilation. If None, defaults to the target specified by --iree_target_triple.", +) + +p.add_argument( + "--mmdit_target", + default=None, + type=str, + help="IREE target for mmdit compilation. If None, defaults to the target specified by --iree_target_triple.", +) + +p.add_argument( + "--vae_target", + default=None, + type=str, + help="IREE target for vae compilation. If None, defaults to the target specified by --iree_target_triple.", +) + ############################################################################## # SD3 Modelling Options # These options are used to control model defining parameters for SD3. 
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index c16c22c3c..ed71a7f9a 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -25,27 +25,11 @@ import copy from datetime import datetime as dt -device_list = [ - "cpu", - "vulkan", - "cuda", - "rocm", -] - -rt_device_list = [ - "local-task", - "local-sync", - "vulkan", - "cuda", - "rocm", - "hip", -] - empty_pipe_dict = { - "vae": None, - "text_encoders": None, + "clip": None, "mmdit": None, "scheduler": None, + "vae": None, } EMPTY_FLAGS = { @@ -90,24 +74,40 @@ def __init__( self.batch_size = batch_size self.num_inference_steps = num_inference_steps self.devices = {} - if isinstance(self.device, dict): + if isinstance(device, dict): assert isinstance(iree_target_triple, dict), "Device and target triple must be both dicts or both strings." self.devices["clip"] = { "device": device["clip"], + "driver": utils.iree_device_map(device["clip"]), "target": iree_target_triple["clip"] } self.devices["mmdit"] = { "device": device["mmdit"], + "driver": utils.iree_device_map(device["mmdit"]), "target": iree_target_triple["mmdit"] } self.devices["vae"] = { "device": device["vae"], + "driver": utils.iree_device_map(device["vae"]), "target": iree_target_triple["vae"] } else: - self.devices["clip"] = device - self.devices["mmdit"] = device - self.devices["vae"] = device + assert isinstance(iree_target_triple, str), "Device and target triple must be both dicts or both strings." + self.devices["clip"] = { + "device": device, + "driver": utils.iree_device_map(device), + "target": iree_target_triple + } + self.devices["mmdit"] = { + "device": device, + "driver": utils.iree_device_map(device), + "target": iree_target_triple + } + self.devices["vae"] = { + "device": device, + "driver": utils.iree_device_map(device), + "target": iree_target_triple + } self.iree_target_triple = iree_target_triple self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS self.attn_spec = attn_spec @@ -176,6 +176,9 @@ def is_prepared(self, vmfbs, weights): val = None default_filepath = None continue + elif key == "clip": + val = "text_encoders" + default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") else: val = vmfbs[key] default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") @@ -197,7 +200,7 @@ def is_prepared(self, vmfbs, weights): default_name = os.path.join( self.external_weights_dir, w_key + "." 
+ self.external_weights ) - if w_key == "text_encoders": + if w_key == "clip": default_name = os.path.join( self.external_weights_dir, f"sd3_clip_fp16.irpa" ) @@ -287,7 +290,7 @@ def export_submodel( if weights_only: input_mlir = { "vae": None, - "text_encoders": None, + "clip": None, "mmdit": None, "scheduler": None, } @@ -366,7 +369,7 @@ def export_submodel( ) del vae_torch return vae_vmfb, vae_external_weight_path - case "text_encoders": + case "clip": _, text_encoders_vmfb = sd3_text_encoders.export_text_encoders( self.hf_model_name, None, @@ -380,7 +383,7 @@ def export_submodel( self.ireec_flags["clip"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, - input_mlir=input_mlir["text_encoders"], + input_mlir=input_mlir["clip"], attn_spec=self.attn_spec, output_batchsize=self.batch_size, ) @@ -392,7 +395,6 @@ def load_pipeline( self, vmfbs: dict, weights: dict, - rt_device: str | dict[str], compiled_pipeline: bool = False, split_scheduler: bool = True, extra_device_args: dict = {}, @@ -401,11 +403,12 @@ def load_pipeline( delegate = extra_device_args["npu_delegate_path"] else: delegate = None + self.runners = {} runners = {} load_start = time.time() runners["pipe"] = vmfbRunner( - rt_device, + self.devices["mmdit"]["driver"], vmfbs["mmdit"], weights["mmdit"], ) @@ -413,23 +416,24 @@ def load_pipeline( print("\n[LOG] MMDiT loaded in ", unet_loaded - load_start, "sec") runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( - rt_device, + self.devices["mmdit"]["driver"], vmfbs["scheduler"], ) sched_loaded = time.time() print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") runners["vae"] = vmfbRunner( - rt_device, + self.devices["vae"]["driver"], vmfbs["vae"], - weights["vae"], + weights["vae"], + extra_plugin=delegate, ) vae_loaded = time.time() print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") - runners["text_encoders"] = vmfbRunner( - rt_device, - vmfbs["text_encoders"], - weights["text_encoders"], + runners["clip"] = vmfbRunner( + self.devices["clip"]["driver"], + vmfbs["clip"], + weights["clip"], ) clip_loaded = time.time() print("\n[LOG] Text Encoders loaded in ", clip_loaded - vae_loaded, "sec") @@ -500,29 +504,29 @@ def generate_images( uncond_input_ids_list = list(uncond_input_ids_dict.values()) text_encoders_inputs = [ ireert.asdevicearray( - self.runners["text_encoders"].config.device, text_input_ids_list[0] + self.runners["clip"].config.device, text_input_ids_list[0] ), ireert.asdevicearray( - self.runners["text_encoders"].config.device, text_input_ids_list[1] + self.runners["clip"].config.device, text_input_ids_list[1] ), ireert.asdevicearray( - self.runners["text_encoders"].config.device, text_input_ids_list[2] + self.runners["clip"].config.device, text_input_ids_list[2] ), ireert.asdevicearray( - self.runners["text_encoders"].config.device, uncond_input_ids_list[0] + self.runners["clip"].config.device, uncond_input_ids_list[0] ), ireert.asdevicearray( - self.runners["text_encoders"].config.device, uncond_input_ids_list[1] + self.runners["clip"].config.device, uncond_input_ids_list[1] ), ireert.asdevicearray( - self.runners["text_encoders"].config.device, uncond_input_ids_list[2] + self.runners["clip"].config.device, uncond_input_ids_list[2] ), ] # Tokenize prompt and negative prompt. 
encode_prompts_start = time.time() prompt_embeds, pooled_prompt_embeds = self.runners[ - "text_encoders" + "clip" ].ctx.modules.compiled_text_encoder["encode_tokens"](*text_encoders_inputs) encode_prompts_end = time.time() @@ -690,6 +694,34 @@ def run_diffusers_cpu( mlirs = copy.deepcopy(map) vmfbs = copy.deepcopy(map) weights = copy.deepcopy(map) + + if any(x for x in [args.clip_device, args.mmdit_device, args.vae_device]): + assert all( + x for x in [args.clip_device, args.mmdit_device, args.vae_device] + ), "Please specify device for all submodels or pass --device for all submodels." + assert all( + x for x in [args.clip_target, args.mmdit_target, args.vae_target] + ), "Please specify target triple for all submodels or pass --iree_target_triple for all submodels." + args.device = "hybrid" + args.iree_target_triple = "_".join([args.clip_target, args.mmdit_target, args.vae_target]) + else: + args.clip_device = args.device + args.mmdit_device = args.device + args.vae_device = args.device + args.clip_target = args.iree_target_triple + args.mmdit_target = args.iree_target_triple + args.vae_target = args.iree_target_triple + + devices = { + "clip": args.clip_device, + "mmdit": args.mmdit_device, + "vae": args.vae_device, + } + targets = { + "clip": args.clip_target, + "mmdit": args.mmdit_target, + "vae": args.vae_target, + } ireec_flags = { "clip": args.ireec_flags + args.clip_flags, "mmdit": args.ireec_flags + args.unet_flags, @@ -705,6 +737,7 @@ def run_diffusers_cpu( str(args.max_length), args.precision, args.device, + args.iree_target_triple, ] if args.decomp_attn: pipe_id_list.append("decomp") @@ -730,8 +763,8 @@ def run_diffusers_cpu( args.max_length, args.batch_size, args.num_inference_steps, - args.device, - args.iree_target_triple, + devices, + targets, ireec_flags, args.attn_spec, args.decomp_attn, @@ -747,7 +780,7 @@ def run_diffusers_cpu( vmfbs.pop("scheduler") weights.pop("scheduler") sd3_pipe.load_pipeline( - vmfbs, weights, args.rt_device, args.compiled_pipeline, args.split_scheduler + vmfbs, weights, args.compiled_pipeline, args.split_scheduler ) sd3_pipe.generate_images( args.prompt, diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 4489141d6..a862e0d39 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -12,6 +12,17 @@ # DPMSolverSDEScheduler, ) +_IREE_DEVICE_MAP = { + "cpu": "local-task", + "cpu-task": "local-task", + "cpu-sync": "local-sync", + "cuda": "cuda", + "vulkan": "vulkan", + "metal": "metal", + "rocm": "rocm", + "hip": "hip", + "intel-gpu": "level_zero", +} # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. 
MI_flags = { "all": [ @@ -81,6 +92,19 @@ ], } +def iree_device_map(device): + uri_parts = device.split("://", 2) + iree_driver = ( + _IREE_DEVICE_MAP[uri_parts[0]] + if uri_parts[0] in _IREE_DEVICE_MAP + else uri_parts[0] + ) + if len(uri_parts) == 1: + return iree_driver + elif "rocm" in uri_parts: + return "rocm" + else: + return f"{iree_driver}://{uri_parts[1]}" def compile_to_vmfb( module_str, From 9656135ad27f830ad3cb2cfb2438fb24272a6928 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 09:40:04 -0500 Subject: [PATCH 125/174] comment trace tensor --- .../custom_models/sd3_inference/sd3_schedulers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 06c23ef1c..9d5968772 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -66,7 +66,7 @@ def __init__( def initialize(self, sample): step_count = torch.tensor(len(self.timesteps)) timesteps = self.model.timesteps - ops.trace_tensor("sample", sample[:,:,0,0]) + #ops.trace_tensor("sample", sample[:,:,0,0]) return ( sample, step_count, From f6ab086dd19aa06a6b3c238b89be92e304fd134e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 10:37:36 -0500 Subject: [PATCH 126/174] Multi-device support (SDXL) --- .../sd3_inference/sd3_cmd_opts.py | 8 + .../sd3_inference/sd3_pipeline.py | 7 +- .../custom_models/sd3_inference/sd3_vae.py | 3 + .../custom_models/sd_inference/schedulers.py | 6 - .../sdxl_inference/sdxl_cmd_opts.py | 58 ++++++- .../sdxl_inference/sdxl_compiled_pipeline.py | 162 ++++++++++++------ .../custom_models/sdxl_inference/vae.py | 82 +++++---- 7 files changed, 236 insertions(+), 90 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index aec606e3e..b3250ea35 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -177,6 +177,14 @@ def is_valid_file(arg): help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.", ) +p.add_argument( + "--npu_delegate_path", + type=str, + default=None, + help="Path to npu executable plugin .dll for running VAE on NPU.", +) + + p.add_argument( "--clip_device", default=None, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index ed71a7f9a..4754fe6da 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -774,13 +774,18 @@ def run_diffusers_cpu( args.vae_decomp_attn, custom_vae=None, cpu_scheduling=args.cpu_scheduling, + vae_precision=args.vae_precision, ) vmfbs, weights = sd3_pipe.check_prepared(mlirs, vmfbs, weights) if args.cpu_scheduling: vmfbs.pop("scheduler") weights.pop("scheduler") + if args.npu_delegate_path: + extra_device_args = {"npu_delegate_path": args.npu_delegate_path} + else: + extra_device_args = {} sd3_pipe.load_pipeline( - vmfbs, weights, args.compiled_pipeline, args.split_scheduler + vmfbs, weights, args.compiled_pipeline, args.split_scheduler, extra_device_args=extra_device_args ) sd3_pipe.generate_images( args.prompt, diff --git 
a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py index 9789be7cd..a70c19882 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py @@ -90,6 +90,9 @@ def export_vae_model( ) return vmfb_path + if device == "cpu": + decomp_attn = True + if dtype == torch.float16: vae_model = vae_model.half() mapper = {} diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index ec58a0d64..bb26e95d1 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -160,7 +160,6 @@ def initialize(self, sample): step_indexes = torch.tensor(len(self.module.timesteps)) timesteps = self.timesteps sample = sample * self.module.init_noise_sigma - print(sample, add_time_ids, step_indexes, timesteps) add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) return sample, add_time_ids, step_indexes, timesteps @@ -184,11 +183,6 @@ def step(self, noise_pred, t, latents, guidance_scale, i): noise_pred = noise_pred_uncond + guidance_scale * ( noise_pred_text - noise_pred_uncond ) - print( - noise_pred[:, :, 0, 2], - t, - latents[:, :, 0, 2], - ) return self.module.step( noise_pred, t, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py index 7acf5d528..5d5bde32f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py @@ -125,7 +125,7 @@ def is_valid_file(arg): p.add_argument( "--split_scheduler", - default=False, + default=True, action="store_true", help="Use a decoupled unet and scheduler for better QOL.", ) @@ -158,6 +158,62 @@ def is_valid_file(arg): help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.", ) +p.add_argument( + "--vae_precision", + type=str, + default="fp16", + help="Precision of VAE weights and graph.", +) + +p.add_argument( + "--npu_delegate_path", + type=str, + default=None, + help="Path to npu executable plugin .dll for running VAE on NPU.", +) + +p.add_argument( + "--clip_device", + default=None, + type=str, + help="Device to run CLIP on. If None, defaults to the device specified in args.device.", +) + +p.add_argument( + "--unet_device", + default=None, + type=str, + help="Device to run unet on. If None, defaults to the device specified in args.device.", +) + +p.add_argument( + "--vae_device", + default=None, + type=str, + help="Device to run VAE on. If None, defaults to the device specified in args.device.", +) + +p.add_argument( + "--clip_target", + default=None, + type=str, + help="IREE target for CLIP compilation. If None, defaults to the target specified by --iree_target_triple.", +) + +p.add_argument( + "--unet_target", + default=None, + type=str, + help="IREE target for unet compilation. If None, defaults to the target specified by --iree_target_triple.", +) + +p.add_argument( + "--vae_target", + default=None, + type=str, + help="IREE target for vae compilation. If None, defaults to the target specified by --iree_target_triple.", +) + ############################################################################## # SDXL Modelling Options # These options are used to control model defining parameters for SDXL. 
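The new per-submodel flags default to None; the pipelines fall back to the global --device and --iree_target_triple whenever a submodel-specific value is not given. A small sketch of that fallback, assuming an argparse-style args namespace; resolve_submodel_options is an illustrative name:

def resolve_submodel_options(args, submodels=("clip", "unet", "vae")):
    devices, targets = {}, {}
    for name in submodels:
        devices[name] = getattr(args, f"{name}_device", None) or args.device
        targets[name] = getattr(args, f"{name}_target", None) or args.iree_target_triple
    return devices, targets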
diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 2edc2866c..8b128eff8 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -28,22 +28,6 @@ import copy from datetime import datetime as dt -device_list = [ - "cpu", - "vulkan", - "cuda", - "rocm", -] - -rt_device_list = [ - "local-task", - "local-sync", - "vulkan", - "cuda", - "rocm", - "hip", -] - empty_pipe_dict = { "vae_decode": None, "prompt_encoder": None, @@ -71,8 +55,8 @@ def __init__( max_length: int, batch_size: int, num_inference_steps: int, - device: str, - iree_target_triple: str, + device: str | dict[str], + iree_target_triple: str | dict[str], ireec_flags: dict = EMPTY_FLAGS, attn_spec: str = None, decomp_attn: bool = False, @@ -82,6 +66,7 @@ def __init__( vae_decomp_attn: bool = True, custom_vae: str = "", cpu_scheduling: bool = False, + vae_precision: str = "fp16", ): self.hf_model_name = hf_model_name self.scheduler_id = scheduler_id @@ -91,8 +76,41 @@ def __init__( self.max_length = max_length self.batch_size = batch_size self.num_inference_steps = num_inference_steps - self.device = device - self.iree_target_triple = iree_target_triple + self.devices = {} + if isinstance(device, dict): + assert isinstance(iree_target_triple, dict), "Device and target triple must be both dicts or both strings." + self.devices["clip"] = { + "device": device["clip"], + "driver": utils.iree_device_map(device["clip"]), + "target": iree_target_triple["clip"] + } + self.devices["unet"] = { + "device": device["unet"], + "driver": utils.iree_device_map(device["unet"]), + "target": iree_target_triple["unet"] + } + self.devices["vae"] = { + "device": device["vae"], + "driver": utils.iree_device_map(device["vae"]), + "target": iree_target_triple["vae"] + } + else: + assert isinstance(iree_target_triple, str), "Device and target triple must be both dicts or both strings." + self.devices["clip"] = { + "device": device, + "driver": utils.iree_device_map(device), + "target": iree_target_triple + } + self.devices["unet"] = { + "device": device, + "driver": utils.iree_device_map(device), + "target": iree_target_triple + } + self.devices["vae"] = { + "device": device, + "driver": utils.iree_device_map(device), + "target": iree_target_triple + } self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS self.attn_spec = attn_spec self.decomp_attn = decomp_attn @@ -100,6 +118,8 @@ def __init__( self.external_weights_dir = external_weights_dir self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn + self.vae_precision = vae_precision + self.vae_dtype = "float32" if vae_precision == "fp32" else "float16" self.custom_vae = custom_vae self.cpu_scheduling = cpu_scheduling # TODO: set this based on user-inputted guidance scale and negative prompt. 
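Both the SD3 and SDXL pipelines normalize their device arguments into the same per-submodel record: the user-facing device name, the IREE driver URI derived from it, and the compile target. A compact sketch of that normalization, accepting either a single string or one value per submodel; build_device_table and the to_driver hook are illustrative stand-ins for the patch's utils.iree_device_map usage:

def build_device_table(device, target, submodels, to_driver=lambda d: d):
    per_dev = device if isinstance(device, dict) else {k: device for k in submodels}
    per_tgt = target if isinstance(target, dict) else {k: target for k in submodels}
    return {
        k: {"device": per_dev[k], "driver": to_driver(per_dev[k]), "target": per_tgt[k]}
        for k in submodels
    }

# e.g. build_device_table("rocm", "gfx942", ("clip", "unet", "vae"))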
@@ -319,8 +339,8 @@ def export_submodel( "vmfb", self.external_weights, unet_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["unet"]["device"], + self.devices["unet"]["target"], self.ireec_flags["unet"], self.decomp_attn, exit_on_vmfb=False, @@ -348,8 +368,8 @@ def export_submodel( "vmfb", self.external_weights, unet_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["unet"]["device"], + self.devices["unet"]["target"], self.ireec_flags["unet"], self.decomp_attn, exit_on_vmfb=False, @@ -373,8 +393,8 @@ def export_submodel( self.num_inference_steps, self.precision, "vmfb", - self.device, - self.iree_target_triple, + self.devices["unet"]["device"], + self.devices["unet"]["target"], self.ireec_flags["scheduler"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, @@ -392,12 +412,12 @@ def export_submodel( self.batch_size, self.height, self.width, - self.precision, + self.vae_precision, "vmfb", self.external_weights, vae_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["vae"]["device"], + self.devices["vae"]["target"], self.ireec_flags["vae"], "decode", self.vae_decomp_attn, @@ -418,8 +438,8 @@ def export_submodel( "vmfb", self.external_weights, prompt_encoder_external_weight_path, - self.device, - self.iree_target_triple, + self.devices["clip"]["device"], + self.devices["clip"]["target"], self.ireec_flags["clip"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, @@ -440,8 +460,8 @@ def export_submodel( ) pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, - self.device, - self.iree_target_triple, + self.devices["unet"]["device"], + self.devices["unet"]["target"], self.ireec_flags["pipeline"], os.path.join(self.pipeline_dir, "pipeline"), return_path=True, @@ -459,8 +479,8 @@ def export_submodel( ) pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, - self.device, - self.iree_target_triple, + self.devices["unet"]["device"], + self.devices["unet"]["target"], self.ireec_flags["pipeline"], os.path.join(self.pipeline_dir, "full_pipeline"), return_path=True, @@ -474,16 +494,20 @@ def load_pipeline( self, vmfbs: dict, weights: dict, - rt_device: str = "local-task", compiled_pipeline: bool = False, split_scheduler: bool = True, + extra_device_args: dict = {}, ): + if "npu_delegate_path" in extra_device_args.keys(): + delegate = extra_device_args["npu_delegate_path"] + else: + delegate = None self.runners = {} runners = {} load_start = time.time() if split_scheduler: runners["pipe"] = vmfbRunner( - rt_device, + self.devices["unet"]["driver"], vmfbs["unet"], weights["unet"], ) @@ -491,7 +515,7 @@ def load_pipeline( print("\n[LOG] Unet loaded in ", unet_loaded - load_start, "sec") if not self.cpu_scheduling: runners["scheduler"] = schedulers.SharkSchedulerWrapper( - args.device, + self.devices["unet"]["driver"], vmfbs["scheduler"], ) else: @@ -509,22 +533,25 @@ def load_pipeline( sched_loaded = time.time() print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") runners["vae_decode"] = vmfbRunner( - rt_device, + self.devices["vae"]["driver"], vmfbs["vae_decode"], weights["vae_decode"], + extra_plugin=delegate, ) vae_loaded = time.time() print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") runners["prompt_encoder"] = vmfbRunner( - rt_device, + self.devices["clip"]["driver"], vmfbs["prompt_encoder"], weights["prompt_encoder"], ) clip_loaded = time.time() print("\n[LOG] CLIP loaded in ", clip_loaded - vae_loaded, "sec") elif compiled_pipeline: + assert 
self.devices["unet"]["device"] == self.devices["clip"]["device"] == self.devices["vae"]["device"], "Compiled pipeline requires all submodels to be on the same device." + assert self.precision == self.vae_precision, "Compiled pipeline requires all submodels to have the same precision for now." runners["pipe"] = vmfbRunner( - rt_device, + self.devices["unet"]["driver"], [ vmfbs["scheduled_unet"], vmfbs["prompt_encoder"], @@ -545,7 +572,7 @@ def load_pipeline( else: runners["pipe"] = vmfbRunner( - rt_device, + self.devices["unet"]["driver"], [ vmfbs["scheduled_unet"], vmfbs["pipeline"], @@ -758,12 +785,10 @@ def generate_images( step_index, ) if isinstance(sample, torch.Tensor): - # TODO: pipe an option for vae_dtype - vae_dtype = "float32" if self.precision == "fp32" else "float16" latents = ireert.asdevicearray( self.runners["vae_decode"].config.device, sample, - dtype=vae_dtype, + dtype=self.vae_dtype, ) else: latents = sample @@ -771,9 +796,11 @@ def generate_images( latents = self.runners["pipe"].ctx.modules.sdxl_compiled_pipeline[ "produce_image_latents" ](samples[i], prompt_embeds, add_text_embeds, guidance_scale) - + if self.devices["unet"]["driver"] != self.devices["vae"]["driver"] or self.precision != self.vae_precision: + latents = ireert.asdevicearray( + self.runners["vae_decode"].config.device, latents.to_host(), dtype=self.vae_dtype + ) vae_start = time.time() - np.save("latents_winter_cat.npy", latents.to_host().astype(np.float32)) vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( latents ) @@ -781,7 +808,6 @@ def generate_images( pipe_end = time.time() image = vae_out.to_host() - np.save("image_winter_cat.npy", image.astype(np.float32)) numpy_images.append(image) print("Batch #", i + 1, "\n") print( @@ -871,6 +897,35 @@ def numpy_to_pil_image(images): mlirs = copy.deepcopy(map) vmfbs = copy.deepcopy(map) weights = copy.deepcopy(map) + + if any(x for x in [args.clip_device, args.unet_device, args.vae_device]): + assert all( + x for x in [args.clip_device, args.unet_device, args.vae_device] + ), "Please specify device for all submodels or pass --device for all submodels." + assert all( + x for x in [args.clip_target, args.unet_target, args.vae_target] + ), "Please specify target triple for all submodels or pass --iree_target_triple for all submodels." 
+ args.device = "hybrid" + args.iree_target_triple = "_".join([args.clip_target, args.unet_target, args.vae_target]) + else: + args.clip_device = args.device + args.unet_device = args.device + args.vae_device = args.device + args.clip_target = args.iree_target_triple + args.unet_target = args.iree_target_triple + args.vae_target = args.iree_target_triple + + devices = { + "clip": args.clip_device, + "unet": args.unet_device, + "vae": args.vae_device, + } + targets = { + "clip": args.clip_target, + "unet": args.unet_target, + "vae": args.vae_target, + } + ireec_flags = { "clip": args.ireec_flags + args.clip_flags, "unet": args.ireec_flags + args.unet_flags, @@ -911,8 +966,8 @@ def numpy_to_pil_image(images): args.max_length, args.batch_size, args.num_inference_steps, - args.device, - args.iree_target_triple, + devices, + targets, ireec_flags, args.attn_spec, args.decomp_attn, @@ -922,12 +977,17 @@ def numpy_to_pil_image(images): args.vae_decomp_attn, custom_vae=None, cpu_scheduling=args.cpu_scheduling, + vae_precision=args.vae_precision, ) vmfbs, weights = sdxl_pipe.check_prepared(mlirs, vmfbs, weights) if args.cpu_scheduling: vmfbs["scheduler"] = None + if args.npu_delegate_path: + extra_device_args = {"npu_delegate_path": args.npu_delegate_path} + else: + extra_device_args = {} sdxl_pipe.load_pipeline( - vmfbs, weights, args.rt_device, args.compiled_pipeline, args.split_scheduler + vmfbs, weights, args.compiled_pipeline, args.split_scheduler, extra_device_args, ) sdxl_pipe.generate_images( args.prompt, diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index 6b21645e7..15ba92a6f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -54,12 +54,12 @@ def __init__( ) self.vae.load_state_dict(custom_vae) - def decode_inp(self, inp): - inp = 1 / 0.13025 * inp - x = self.vae.decode(inp, return_dict=False)[0] + def decode(self, inp): + img = 1 / 0.13025 * inp + x = self.vae.decode(img, return_dict=False)[0] return (x / 2 + 0.5).clamp(0, 1) - def encode_inp(self, inp): + def encode(self, inp): latents = self.vae.encode(inp).latent_dist.sample() return 0.13025 * latents @@ -105,45 +105,65 @@ def export_vae_model( ) return vmfb_path - mapper = {} - decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) - if decomp_attn == True: - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) + if device == "cpu": + decomp_attn = True + dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": vae_model = vae_model.half() + + mapper = {} + utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path ) if weights_only: return external_weight_path - sample = (batch_size, 4, height // 8, width // 8) - if variant == "encode": - sample = (batch_size, 3, height, width) + + input_image_shape = (height, width, 3) + input_latents_shape = (batch_size, 4, height // 8, width // 8) + encode_args = [ + torch.empty( + input_image_shape, + dtype=torch.float32, + ) + ] + decode_args = [ + torch.empty( + input_latents_shape, + dtype=dtype, + ) + ] + decomp_list = [] + if decomp_attn == True: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with 
decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(vae_model) - class CompiledVae(CompiledModule): - if external_weights: - params = export_parameters( - vae_model, external=True, external_scope="", name_mapper=mapper.get - ) - else: - params = export_parameters(vae_model) + # @fxb.export_program(args=(encode_args,)) + # def _encode(module, inputs,): + # return module.encode(*inputs) + + @fxb.export_program(args=(decode_args,)) + def _decode(module, inputs): + return module.decode(*inputs) - def main(self, inp=AbstractTensor(*sample, dtype=dtype)): - if variant == "decode": - return jittable(vae_model.decode_inp, decompose_ops=decomp_list)(inp) - elif variant == "encode": - return jittable(vae_model.encode_inp, decompose_ops=decomp_list)(inp) + class CompiledVae(CompiledModule): + main = _decode + + if external_weights: + externalize_module_parameters(vae_model) - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledVae(context=Context(), import_to=import_to) + inst = CompiledVae(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) + module_str = str(CompiledModule.get_mlir_module(inst)) if compile_to != "vmfb": return module_str From 7b861a76061cc5178f68e9007cac33485db1877d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 10:38:05 -0500 Subject: [PATCH 127/174] Fix formatting --- .../custom_models/sd3_inference/sd3_mmdit.py | 8 +-- .../sd3_inference/sd3_mmdit_runner.py | 16 ++++-- .../sd3_inference/sd3_pipeline.py | 37 +++++++++----- .../sd3_inference/sd3_schedulers.py | 3 +- .../custom_models/sd_inference/utils.py | 2 + .../sdxl_inference/sdxl_compiled_pipeline.py | 49 +++++++++++++------ .../custom_models/sdxl_inference/vae.py | 4 +- 7 files changed, 83 insertions(+), 36 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 07a2f1e2a..9d6ea012d 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -52,10 +52,11 @@ def forward( return_dict=False, )[0] return noise_pred - + + class MMDiTAttention(torch.nn.Module): def __init__( - self, + self, ): super().__init__() @@ -84,7 +85,7 @@ def export_attn( if dtype == torch.float16: attn_module = attn_module.half() - + example_qkv = [ torch.empty(qkv_shape, dtype=dtype), torch.empty(qkv_shape, dtype=dtype), @@ -134,6 +135,7 @@ class CompiledAttn(CompiledModule): ) return vmfb_path + @torch.no_grad() def export_mmdit_model( mmdit_model, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py index f547b59bb..a0be81192 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -57,6 +57,7 @@ def run_diffusers_mmdit( return noise_pred.numpy() + def run_attn_turbine(q, k, v, args): attn_runner = vmfbRunner( args.device, @@ -73,6 +74,7 @@ def run_attn_turbine(q, k, v, args): ).to_host() return attn_output + @torch.no_grad() def run_attn_torch(q, k, v, args): from turbine_models.custom_models.sd3_inference.sd3_mmdit import MMDiTAttention @@ -86,6 +88,7 @@ def run_attn_torch(q, k, v, args): return attn_output.numpy() + def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): if not 
np.allclose(turbine_output, torch_output, rtol=4e-2, atol=4e-2): if turbine_output.ndim > 0: @@ -93,14 +96,19 @@ def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): for idx, i in enumerate(torch_output): dim = [*orig_dim, idx] try: - np.testing.assert_allclose(turbine_output[idx], torch_output[idx], rtol=4e-2, atol=4e-2) + np.testing.assert_allclose( + turbine_output[idx], torch_output[idx], rtol=4e-2, atol=4e-2 + ) except Exception as e: err = np.abs(turbine_output[idx] - torch_output[idx]) failed_dims.append(dim) errs.append([err, turbine_output[idx], torch_output[idx]]) - failed_dims, errs = find_errs(turbine_output[idx], torch_output[idx], dim, failed_dims, errs) + failed_dims, errs = find_errs( + turbine_output[idx], torch_output[idx], dim, failed_dims, errs + ) return (failed_dims, errs) + if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args import numpy as np @@ -137,8 +145,8 @@ def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): print("Torch output: ", errs[idx][2]) print(torch_output.shape) exit() - - batch_size = args.batch_size * 2 #do classifier free guidance + + batch_size = args.batch_size * 2 # do classifier free guidance hidden_states = torch.randn( (batch_size, 16, args.height // 8, args.width // 8), dtype=dtype ) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 4754fe6da..686e2b453 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -75,38 +75,42 @@ def __init__( self.num_inference_steps = num_inference_steps self.devices = {} if isinstance(device, dict): - assert isinstance(iree_target_triple, dict), "Device and target triple must be both dicts or both strings." + assert isinstance( + iree_target_triple, dict + ), "Device and target triple must be both dicts or both strings." self.devices["clip"] = { "device": device["clip"], "driver": utils.iree_device_map(device["clip"]), - "target": iree_target_triple["clip"] + "target": iree_target_triple["clip"], } self.devices["mmdit"] = { "device": device["mmdit"], "driver": utils.iree_device_map(device["mmdit"]), - "target": iree_target_triple["mmdit"] + "target": iree_target_triple["mmdit"], } self.devices["vae"] = { "device": device["vae"], "driver": utils.iree_device_map(device["vae"]), - "target": iree_target_triple["vae"] + "target": iree_target_triple["vae"], } else: - assert isinstance(iree_target_triple, str), "Device and target triple must be both dicts or both strings." + assert isinstance( + iree_target_triple, str + ), "Device and target triple must be both dicts or both strings." 
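The find_errs helper reformatted above walks mismatched tensors recursively to report the failing indices and their values. A compact, self-contained illustration of the same tolerance check with the patch's rtol/atol values; first_mismatch is an illustrative name:

import numpy as np

def first_mismatch(a, b, rtol=4e-2, atol=4e-2):
    # Return the index of the first element that falls outside the tolerance, or None.
    bad = ~np.isclose(a, b, rtol=rtol, atol=atol)
    if not bad.any():
        return None
    return tuple(int(i[0]) for i in np.nonzero(bad))

a = np.zeros((2, 3)); b = a.copy(); b[1, 2] = 1.0
assert first_mismatch(a, b) == (1, 2)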
self.devices["clip"] = { "device": device, "driver": utils.iree_device_map(device), - "target": iree_target_triple + "target": iree_target_triple, } self.devices["mmdit"] = { "device": device, "driver": utils.iree_device_map(device), - "target": iree_target_triple + "target": iree_target_triple, } self.devices["vae"] = { "device": device, "driver": utils.iree_device_map(device), - "target": iree_target_triple + "target": iree_target_triple, } self.iree_target_triple = iree_target_triple self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS @@ -645,7 +649,8 @@ def generate_images( image.save(img_path) print(img_path, "saved") return - + + def run_diffusers_cpu( hf_model_name, prompt, @@ -658,7 +663,9 @@ def run_diffusers_cpu( ): from diffusers import StableDiffusion3Pipeline - pipe = StableDiffusion3Pipeline.from_pretrained(hf_model_name, torch_dtype=torch.float32) + pipe = StableDiffusion3Pipeline.from_pretrained( + hf_model_name, torch_dtype=torch.float32 + ) pipe = pipe.to("cpu") generator = torch.Generator().manual_seed(int(seed)) @@ -703,7 +710,9 @@ def run_diffusers_cpu( x for x in [args.clip_target, args.mmdit_target, args.vae_target] ), "Please specify target triple for all submodels or pass --iree_target_triple for all submodels." args.device = "hybrid" - args.iree_target_triple = "_".join([args.clip_target, args.mmdit_target, args.vae_target]) + args.iree_target_triple = "_".join( + [args.clip_target, args.mmdit_target, args.vae_target] + ) else: args.clip_device = args.device args.mmdit_device = args.device @@ -785,7 +794,11 @@ def run_diffusers_cpu( else: extra_device_args = {} sd3_pipe.load_pipeline( - vmfbs, weights, args.compiled_pipeline, args.split_scheduler, extra_device_args=extra_device_args + vmfbs, + weights, + args.compiled_pipeline, + args.split_scheduler, + extra_device_args=extra_device_args, ) sd3_pipe.generate_images( args.prompt, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 9d5968772..86179746a 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -66,7 +66,7 @@ def __init__( def initialize(self, sample): step_count = torch.tensor(len(self.timesteps)) timesteps = self.model.timesteps - #ops.trace_tensor("sample", sample[:,:,0,0]) + # ops.trace_tensor("sample", sample[:,:,0,0]) return ( sample, step_count, @@ -93,6 +93,7 @@ def step(self, noise_pred, t, sample, guidance_scale, i): sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) + # Wraps a diffusers scheduler running on native pytorch+cpu. # This allows us to use it interchangeably with compiled schedulers in our pipeline(s). 
class TorchCPUFlowSchedulerCompat: diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index a862e0d39..e4b755131 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -92,6 +92,7 @@ ], } + def iree_device_map(device): uri_parts = device.split("://", 2) iree_driver = ( @@ -106,6 +107,7 @@ def iree_device_map(device): else: return f"{iree_driver}://{uri_parts[1]}" + def compile_to_vmfb( module_str, device, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 8b128eff8..550e42679 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -78,38 +78,42 @@ def __init__( self.num_inference_steps = num_inference_steps self.devices = {} if isinstance(device, dict): - assert isinstance(iree_target_triple, dict), "Device and target triple must be both dicts or both strings." + assert isinstance( + iree_target_triple, dict + ), "Device and target triple must be both dicts or both strings." self.devices["clip"] = { "device": device["clip"], "driver": utils.iree_device_map(device["clip"]), - "target": iree_target_triple["clip"] + "target": iree_target_triple["clip"], } self.devices["unet"] = { "device": device["unet"], "driver": utils.iree_device_map(device["unet"]), - "target": iree_target_triple["unet"] + "target": iree_target_triple["unet"], } self.devices["vae"] = { "device": device["vae"], "driver": utils.iree_device_map(device["vae"]), - "target": iree_target_triple["vae"] + "target": iree_target_triple["vae"], } else: - assert isinstance(iree_target_triple, str), "Device and target triple must be both dicts or both strings." + assert isinstance( + iree_target_triple, str + ), "Device and target triple must be both dicts or both strings." self.devices["clip"] = { "device": device, "driver": utils.iree_device_map(device), - "target": iree_target_triple + "target": iree_target_triple, } self.devices["unet"] = { "device": device, "driver": utils.iree_device_map(device), - "target": iree_target_triple + "target": iree_target_triple, } self.devices["vae"] = { "device": device, "driver": utils.iree_device_map(device), - "target": iree_target_triple + "target": iree_target_triple, } self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS self.attn_spec = attn_spec @@ -548,8 +552,14 @@ def load_pipeline( clip_loaded = time.time() print("\n[LOG] CLIP loaded in ", clip_loaded - vae_loaded, "sec") elif compiled_pipeline: - assert self.devices["unet"]["device"] == self.devices["clip"]["device"] == self.devices["vae"]["device"], "Compiled pipeline requires all submodels to be on the same device." - assert self.precision == self.vae_precision, "Compiled pipeline requires all submodels to have the same precision for now." + assert ( + self.devices["unet"]["device"] + == self.devices["clip"]["device"] + == self.devices["vae"]["device"] + ), "Compiled pipeline requires all submodels to be on the same device." + assert ( + self.precision == self.vae_precision + ), "Compiled pipeline requires all submodels to have the same precision for now." 
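The assertions above gate the fully compiled pipeline on every submodel sharing one device and one precision; the split path instead re-stages latents between runners when they differ, as the generate_images change further down shows. A sketch of that cross-runner handoff under those assumptions; stage_for_vae is an illustrative name, while asdevicearray and to_host follow the IREE runtime usage already in the patch:

import iree.runtime as ireert

def stage_for_vae(latents, same_driver: bool, same_precision: bool, vae_device, vae_dtype):
    if same_driver and same_precision:
        return latents  # reuse the device array, no host round-trip needed
    # Copy back to host and re-upload on the VAE runner's device in its dtype.
    return ireert.asdevicearray(vae_device, latents.to_host(), dtype=vae_dtype)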
runners["pipe"] = vmfbRunner( self.devices["unet"]["driver"], [ @@ -796,9 +806,14 @@ def generate_images( latents = self.runners["pipe"].ctx.modules.sdxl_compiled_pipeline[ "produce_image_latents" ](samples[i], prompt_embeds, add_text_embeds, guidance_scale) - if self.devices["unet"]["driver"] != self.devices["vae"]["driver"] or self.precision != self.vae_precision: + if ( + self.devices["unet"]["driver"] != self.devices["vae"]["driver"] + or self.precision != self.vae_precision + ): latents = ireert.asdevicearray( - self.runners["vae_decode"].config.device, latents.to_host(), dtype=self.vae_dtype + self.runners["vae_decode"].config.device, + latents.to_host(), + dtype=self.vae_dtype, ) vae_start = time.time() vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( @@ -906,7 +921,9 @@ def numpy_to_pil_image(images): x for x in [args.clip_target, args.unet_target, args.vae_target] ), "Please specify target triple for all submodels or pass --iree_target_triple for all submodels." args.device = "hybrid" - args.iree_target_triple = "_".join([args.clip_target, args.unet_target, args.vae_target]) + args.iree_target_triple = "_".join( + [args.clip_target, args.unet_target, args.vae_target] + ) else: args.clip_device = args.device args.unet_device = args.device @@ -987,7 +1004,11 @@ def numpy_to_pil_image(images): else: extra_device_args = {} sdxl_pipe.load_pipeline( - vmfbs, weights, args.compiled_pipeline, args.split_scheduler, extra_device_args, + vmfbs, + weights, + args.compiled_pipeline, + args.split_scheduler, + extra_device_args, ) sdxl_pipe.generate_images( args.prompt, diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index 15ba92a6f..b5753b346 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -107,7 +107,7 @@ def export_vae_model( if device == "cpu": decomp_attn = True - + dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": vae_model = vae_model.half() @@ -119,7 +119,7 @@ def export_vae_model( ) if weights_only: return external_weight_path - + input_image_shape = (height, width, 3) input_latents_shape = (batch_size, 4, height // 8, width // 8) encode_args = [ From 80486831a9d03605f48f0c85ab5e36290ac367f4 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 10:44:46 -0500 Subject: [PATCH 128/174] adds PEFT req. 
for lora scaling export issue and fix diffusers --- models/requirements.txt | 1 + models/setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/models/requirements.txt b/models/requirements.txt index 87f92e7c6..ead79c1d9 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -4,6 +4,7 @@ shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 torchsde accelerate +peft diffusers @ git+https://github.com/nod-ai/diffusers@0.29.0.dev0-shark brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading diff --git a/models/setup.py b/models/setup.py index 2c54c7d43..09d60cfe3 100644 --- a/models/setup.py +++ b/models/setup.py @@ -57,7 +57,7 @@ def load_version_info(): "Shark-Turbine", "protobuf", "sentencepiece", - "transformers==4.37.1", + "transformers>=4.37.1", "accelerate", "diffusers==0.29.0.dev0", "azure-storage-blob", From 05660c00ccd2d60df8912991240c8a49f26564c0 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 11:57:24 -0500 Subject: [PATCH 129/174] Add scheduler_id to sd3 api for unified signature --- .../turbine_models/custom_models/sd3_inference/sd3_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 686e2b453..5a80b2633 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -44,7 +44,6 @@ class SharkSD3Pipeline: def __init__( self, hf_model_name: str, - # scheduler_id: str, height: int, width: int, shift: float, @@ -63,6 +62,7 @@ def __init__( vae_decomp_attn: bool = True, custom_vae: str = "", cpu_scheduling: bool = False, + scheduler_id: str = None, #compatibility only, always uses EulerFlowScheduler ): self.hf_model_name = hf_model_name # self.scheduler_id = scheduler_id From 4692e11339a295a16e963181bbb6d2841714388e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 17 Jun 2024 18:08:06 -0500 Subject: [PATCH 130/174] Remove sentencepiece from reqs. 
--- models/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/models/requirements.txt b/models/requirements.txt index ead79c1d9..bdd1892e8 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -1,5 +1,4 @@ protobuf -sentencepiece shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main transformers==4.37.1 torchsde From 8b775aae35a5a0b13bc4081115327ef74747a077 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 18 Jun 2024 01:12:24 -0500 Subject: [PATCH 131/174] Temporarily comment out create_hal_driver usage for old iree version compat (DNM) --- models/turbine_models/model_runner.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/models/turbine_models/model_runner.py b/models/turbine_models/model_runner.py index 1b27ca83b..41dc8746e 100644 --- a/models/turbine_models/model_runner.py +++ b/models/turbine_models/model_runner.py @@ -1,7 +1,7 @@ import argparse import sys from iree import runtime as ireert -from iree.runtime._binding import create_hal_driver +#from iree.runtime._binding import create_hal_driver class vmfbRunner: @@ -11,14 +11,14 @@ def __init__(self, device, vmfb_path, external_weight_path=None, extra_plugin=No # If an extra plugin is requested, add a global flag to load the plugin # and create the driver using the non-caching creation function, as # the caching creation function may ignore the flag. - if extra_plugin: - ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}") - haldriver = create_hal_driver(device) + # if extra_plugin: + # ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}") + # haldriver = create_hal_driver(device) # No plugin requested: create the driver with the caching create # function. - else: - haldriver = ireert.get_driver(device) + #else: + haldriver = ireert.get_driver(device) if "://" in device: try: device_idx = int(device.split("://")[-1]) From fd2a2ba40e2110e666f91847c1c2c5a19a4126fd Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 18 Jun 2024 16:00:32 -0500 Subject: [PATCH 132/174] Fixes for vae precision/attn decomposition, numerics validation --- .../sd3_inference/sd3_cmd_opts.py | 8 ++++- .../sd3_inference/sd3_pipeline.py | 32 +++++++++++-------- .../sd3_inference/sd3_vae_runner.py | 25 ++++++++++----- .../sd3_inference/text_encoder_impls.py | 4 ++- .../sdxl_inference/unet_runner.py | 6 ++-- models/turbine_models/model_runner.py | 12 +++---- 6 files changed, 54 insertions(+), 33 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index b3250ea35..ac97d77e4 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -247,6 +247,12 @@ def is_valid_file(arg): default="fp16", help="Precision of Stable Diffusion weights and graph.", ) +p.add_argument( + "--vae_precision", + type=str, + default=None, + help="Precision of Stable Diffusion VAE weights and graph.", +) p.add_argument( "--max_length", type=int, default=77, help="Sequence Length of Stable Diffusion" ) @@ -257,7 +263,7 @@ def is_valid_file(arg): p.add_argument( "--vae_decomp_attn", type=bool, - default=True, + default=False, help="Decompose attention for VAE decode only at fx graph level", ) p.add_argument( diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 5a80b2633..7f1ec7022 100644 --- 
a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -46,7 +46,6 @@ def __init__( hf_model_name: str, height: int, width: int, - shift: float, precision: str, max_length: int, batch_size: int, @@ -59,10 +58,12 @@ def __init__( pipeline_dir: str = "./shark_vmfbs", external_weights_dir: str = "./shark_weights", external_weights: str = "safetensors", - vae_decomp_attn: bool = True, - custom_vae: str = "", + vae_decomp_attn: bool = False, cpu_scheduling: bool = False, + vae_precision: str = "fp32", scheduler_id: str = None, #compatibility only, always uses EulerFlowScheduler + shift: float = 1.0, + ): self.hf_model_name = hf_model_name # self.scheduler_id = scheduler_id @@ -120,10 +121,11 @@ def __init__( self.external_weights_dir = external_weights_dir self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn - self.custom_vae = custom_vae + self.custom_vae = None self.cpu_scheduling = cpu_scheduling self.torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 - self.vae_dtype = torch.float32 + self.vae_precision = vae_precision if vae_precision else self.precision + self.vae_dtype = torch.float32 if vae_precision == "fp32" else torch.float16 # TODO: set this based on user-inputted guidance scale and negative prompt. self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True @@ -206,7 +208,12 @@ def is_prepared(self, vmfbs, weights): ) if w_key == "clip": default_name = os.path.join( - self.external_weights_dir, f"sd3_clip_fp16.irpa" + self.external_weights_dir, f"sd3_text_encoders_{self.precision}.irpa" + ) + if w_key == "mmdit": + default_name = os.path.join( + self.external_weights_dir, + f"sd3_mmdit_{self.precision}." 
+ self.external_weights, ) if weights[w_key] is None and os.path.exists(default_name): weights[w_key] = os.path.join(default_name) @@ -357,7 +364,7 @@ def export_submodel( self.batch_size, self.height, self.width, - "fp32", + self.vae_precision, "vmfb", self.external_weights, vae_external_weight_path, @@ -586,7 +593,8 @@ def generate_images( dtype=self.vae_dtype, ) else: - latents = sample.astype("float32") + vae_numpy_dtype = np.float32 if self.vae_precision == "fp32" else np.float16 + latents = sample.astype(vae_numpy_dtype) vae_start = time.time() vae_out = self.runners["vae"].ctx.modules.compiled_vae["decode"](latents) @@ -634,7 +642,7 @@ def generate_images( out_image = Image.fromarray(image) images.extend([[out_image]]) if return_imgs: - return images + return images[0] for idx_batch, image_batch in enumerate(images): for idx, image in enumerate(image_batch): img_path = ( @@ -767,7 +775,6 @@ def run_diffusers_cpu( args.hf_model_name, args.height, args.width, - args.shift, args.precision, args.max_length, args.batch_size, @@ -779,9 +786,8 @@ def run_diffusers_cpu( args.decomp_attn, args.pipeline_dir, args.external_weights_dir, - args.external_weights, - args.vae_decomp_attn, - custom_vae=None, + external_weights=args.external_weights, + vae_decomp_attn=args.vae_decomp_attn, cpu_scheduling=args.cpu_scheduling, vae_precision=args.vae_precision, ) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py index 23db4ab73..31b23b429 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py @@ -15,8 +15,8 @@ def run_vae( ): runner = vmfbRunner(device, vmfb_path, external_weight_path) inputs = [ireert.asdevicearray(runner.config.device, example_input)] - results = runner.ctx.modules.compiled_vae["decode"](*inputs) - + results = runner.ctx.modules.compiled_vae["decode"](*inputs).to_host() + results = imagearray_from_vae_out(results) return results @@ -32,11 +32,19 @@ def run_torch_vae(hf_model_name, variant, example_input): elif variant == "encode": results = vae_model.encode(example_input) np_torch_output = results.detach().cpu().numpy() + np_torch_output = imagearray_from_vae_out(np_torch_output) return np_torch_output +def imagearray_from_vae_out(image): + if image.ndim == 4: + image = image[0] + image = torch.from_numpy(image).cpu().permute(1, 2, 0).float().numpy() + image = (image * 255).round().astype("uint8") + return image if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args + import numpy as np dtype = torch.float16 if args.precision == "fp16" else torch.float32 if args.vae_variant == "decode": @@ -57,9 +65,9 @@ def run_torch_vae(hf_model_name, variant, example_input): ) print( "TURBINE OUTPUT:", - turbine_results.to_host(), - turbine_results.to_host().shape, - turbine_results.to_host().dtype, + turbine_results, + turbine_results.shape, + turbine_results.dtype, ) if args.compare_vs_torch: print("generating torch output: ") @@ -69,9 +77,10 @@ def run_torch_vae(hf_model_name, variant, example_input): args.hf_model_name, args.vae_variant, example_input.float() ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) - err = utils.largest_error(torch_output, turbine_results) - print("Largest Error: ", err) - assert err < 2e-3 + # Allow a small amount of wiggle room for rounding errors (1) + np.testing.assert_allclose( + 
turbine_results, torch_output, rtol=1, atol=1 + ) # TODO: Figure out why we occasionally segfault without unlinking output variables turbine_results = None diff --git a/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py index 29b9d2f80..747b60d9b 100644 --- a/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py +++ b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py @@ -341,8 +341,10 @@ def __init__(self): self.clip_g = SDXLClipGTokenizer(clip_tokenizer) self.t5xxl = T5XXLTokenizer() - def tokenize_with_weights(self, text: str): + def tokenize_with_weights(self, text: str | list[str]): out = {} + if isinstance(text, list): + text = text[0] out["g"] = self.clip_g.tokenize_with_weights(text) out["l"] = self.clip_l.tokenize_with_weights(text) out["t5xxl"] = self.t5xxl.tokenize_with_weights(text) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index 4437b9eae..9d0b405c3 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -31,9 +31,8 @@ def run_unet( ireert.asdevicearray(runner.config.device, prompt_embeds), ireert.asdevicearray(runner.config.device, text_embeds), ireert.asdevicearray(runner.config.device, time_ids), - ireert.asdevicearray(runner.config.device, guidance_scale), ] - results = runner.ctx.modules.compiled_unet["main"](*inputs) + results = runner.ctx.modules.compiled_unet["run_forward"](*inputs) return results @@ -57,7 +56,6 @@ def run_unet_steps( ireert.asdevicearray(runner.config.device, prompt_embeds), ireert.asdevicearray(runner.config.device, text_embeds), ireert.asdevicearray(runner.config.device, time_ids), - ireert.asdevicearray(runner.config.device, (guidance_scale,)), ] for i, t in tqdm(enumerate(scheduler.timesteps)): timestep = t @@ -69,7 +67,7 @@ def run_unet_steps( inputs[1] = timestep = ireert.asdevicearray( runner.config.device, (timestep,), dtype="int64" ) - noise_pred = runner.ctx.modules.compiled_unet["main"](*inputs).to_host() + noise_pred = runner.ctx.modules.compiled_unet["run_forward"](*inputs).to_host() sample = scheduler.step( torch.from_numpy(noise_pred).cpu(), timestep, diff --git a/models/turbine_models/model_runner.py b/models/turbine_models/model_runner.py index 41dc8746e..1b27ca83b 100644 --- a/models/turbine_models/model_runner.py +++ b/models/turbine_models/model_runner.py @@ -1,7 +1,7 @@ import argparse import sys from iree import runtime as ireert -#from iree.runtime._binding import create_hal_driver +from iree.runtime._binding import create_hal_driver class vmfbRunner: @@ -11,14 +11,14 @@ def __init__(self, device, vmfb_path, external_weight_path=None, extra_plugin=No # If an extra plugin is requested, add a global flag to load the plugin # and create the driver using the non-caching creation function, as # the caching creation function may ignore the flag. - # if extra_plugin: - # ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}") - # haldriver = create_hal_driver(device) + if extra_plugin: + ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}") + haldriver = create_hal_driver(device) # No plugin requested: create the driver with the caching create # function. 
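The runner logic restored here picks between the cached get_driver path and a freshly created HAL driver when an executable plugin such as an NPU delegate must be loaded first. A minimal sketch of that choice, using only calls already present in the runner; make_driver is an illustrative name:

from iree import runtime as ireert
from iree.runtime._binding import create_hal_driver

def make_driver(device: str, plugin_path=None):
    if plugin_path:
        # The plugin flag must be parsed before the driver exists, so use the
        # non-caching creation path, mirroring the restored runner logic.
        ireert.flags.parse_flags(f"--executable_plugin={plugin_path}")
        return create_hal_driver(device)
    return ireert.get_driver(device)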
- #else: - haldriver = ireert.get_driver(device) + else: + haldriver = ireert.get_driver(device) if "://" in device: try: device_idx = int(device.split("://")[-1]) From b1f20f1c07e6b75d358fffbfc3672f9dd6a49fc8 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 18 Jun 2024 20:28:35 -0500 Subject: [PATCH 133/174] Fix numerics, add some features to VAE runner, add cpu scheduling options --- .../sd3_inference/sd3_cmd_opts.py | 6 + .../custom_models/sd3_inference/sd3_mmdit.py | 2 +- .../sd3_inference/sd3_mmdit_runner.py | 3 +- .../sd3_inference/sd3_pipeline.py | 109 +++++++++++++----- .../sd3_inference/sd3_schedulers.py | 41 ++++++- .../custom_models/sd3_inference/sd3_vae.py | 1 + .../sd3_inference/sd3_vae_runner.py | 16 ++- 7 files changed, 141 insertions(+), 37 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index ac97d77e4..55cf3b72d 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -346,6 +346,12 @@ def is_valid_file(arg): action="store_true", help="Just compile attention reproducer for mmdit.", ) +p.add_argument( + "--vae_input_path", + type=str, + default=None, + help="Path to input latents for VAE inference numerics validation.", +) ############################################################################## diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 9d6ea012d..05d3e00cb 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -207,7 +207,7 @@ def export_mmdit_model( torch.empty(hidden_states_shape, dtype=dtype), torch.empty(encoder_hidden_states_shape, dtype=dtype), torch.empty(pooled_projections_shape, dtype=dtype), - torch.empty(1, dtype=dtype), + torch.empty(init_batch_dim, dtype=dtype), ] decomp_list = [] diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py index a0be81192..06100eab3 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit_runner.py @@ -154,7 +154,7 @@ def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): (batch_size, args.max_length * 2, 4096), dtype=dtype ) pooled_projections = torch.randn((batch_size, 2048), dtype=dtype) - timestep = torch.tensor([0], dtype=dtype) + timestep = torch.tensor([0, 0], dtype=dtype) turbine_output = run_mmdit_turbine( hidden_states, @@ -180,6 +180,7 @@ def find_errs(turbine_output, torch_output, dim=[], failed_dims=[], errs=[]): timestep, args, ) + np.save("torch_mmdit_output.npy", torch_output.astype(np.float16)) print("torch OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) print("\n(torch (comfy) image latents to iree image latents): ") diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 7f1ec7022..303ba326e 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -17,6 +17,7 @@ from turbine_models.custom_models.sd_inference import utils from turbine_models.model_runner import vmfbRunner from 
transformers import CLIPTokenizer +from diffusers import FlowMatchEulerDiscreteScheduler from PIL import Image import os @@ -426,10 +427,16 @@ def load_pipeline( unet_loaded = time.time() print("\n[LOG] MMDiT loaded in ", unet_loaded - load_start, "sec") - runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( - self.devices["mmdit"]["driver"], - vmfbs["scheduler"], - ) + if not self.cpu_scheduling: + runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( + self.devices["mmdit"]["driver"], + vmfbs["scheduler"], + ) + else: + print("Using torch CPU scheduler.") + runners["scheduler"] = FlowMatchEulerDiscreteScheduler.from_pretrained( + self.hf_model_name, subfolder="scheduler" + ) sched_loaded = time.time() print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") @@ -502,11 +509,12 @@ def generate_images( ) ) - guidance_scale = ireert.asdevicearray( - self.runners["pipe"].config.device, - np.asarray([guidance_scale]), - dtype=iree_dtype, - ) + if not self.cpu_scheduling: + guidance_scale = ireert.asdevicearray( + self.runners["pipe"].config.device, + np.asarray([guidance_scale]), + dtype=iree_dtype, + ) tokenize_start = time.time() text_input_ids_dict = self.tokenizer.tokenize_with_weights(prompt) @@ -540,12 +548,23 @@ def generate_images( "clip" ].ctx.modules.compiled_text_encoder["encode_tokens"](*text_encoders_inputs) encode_prompts_end = time.time() + if self.cpu_scheduling: + timesteps, num_inference_steps = sd3_schedulers.retrieve_timesteps( + self.runners["scheduler"], + num_inference_steps=self.num_inference_steps, + timesteps=None, + ) + steps = num_inference_steps + for i in range(batch_count): unet_start = time.time() - sample, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) + if not self.cpu_scheduling: + latents, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) + else: + latents = torch.tensor(samples[i].to_host(), dtype=self.torch_dtype) iree_inputs = [ - sample, + latents, ireert.asdevicearray( self.runners["pipe"].config.device, prompt_embeds, dtype=iree_dtype ), @@ -560,41 +579,71 @@ def generate_images( # print(f"step {s}") if self.cpu_scheduling: step_index = s + t = timesteps[s] + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * 2) + timestep = ireert.asdevicearray( + self.runners["pipe"].config.device, + t.expand(latent_model_input.shape[0]), + dtype=iree_dtype, + ) + latent_model_input = ireert.asdevicearray( + self.runners["pipe"].config.device, + latent_model_input, + dtype=iree_dtype, + ) else: step_index = ireert.asdevicearray( self.runners["scheduler"].runner.config.device, torch.tensor([s]), "int64", ) - latents, t = self.runners["scheduler"].prep( - sample, - step_index, - timesteps, - ) + latent_model_input, timestep = self.runners["scheduler"].prep( + latents, + step_index, + timesteps, + ) + t = ireert.asdevicearray( + self.runners["scheduler"].runner.config.device, + timestep.to_host()[0] + ) noise_pred = self.runners["pipe"].ctx.modules.compiled_mmdit[ "run_forward" ]( - latents, + latent_model_input, iree_inputs[1], iree_inputs[2], - t, - ) - sample = self.runners["scheduler"].step( - noise_pred, - t, - sample, - guidance_scale, - step_index, + timestep, ) - if isinstance(sample, torch.Tensor): + if not self.cpu_scheduling: + latents = self.runners["scheduler"].step( + noise_pred, + t, + latents, + guidance_scale, + step_index, + ) + else: + noise_pred = torch.tensor(noise_pred.to_host(), dtype=self.torch_dtype) + if self.do_classifier_free_guidance: + 
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.runners["scheduler"].step( + noise_pred, + t, + latents, + return_dict=False, + )[0] + + if isinstance(latents, torch.Tensor): + latents = latents.type(self.vae_dtype) latents = ireert.asdevicearray( self.runners["vae"].config.device, - sample, - dtype=self.vae_dtype, + latents, ) else: vae_numpy_dtype = np.float32 if self.vae_precision == "fp32" else np.float16 - latents = sample.astype(vae_numpy_dtype) + latents = latents.astype(vae_numpy_dtype) vae_start = time.time() vae_out = self.runners["vae"].ctx.modules.compiled_vae["decode"](latents) @@ -791,10 +840,10 @@ def run_diffusers_cpu( cpu_scheduling=args.cpu_scheduling, vae_precision=args.vae_precision, ) - vmfbs, weights = sd3_pipe.check_prepared(mlirs, vmfbs, weights) if args.cpu_scheduling: vmfbs.pop("scheduler") weights.pop("scheduler") + vmfbs, weights = sd3_pipe.check_prepared(mlirs, vmfbs, weights) if args.npu_delegate_path: extra_device_args = {"npu_delegate_path": args.npu_delegate_path} else: diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 86179746a..0fe4ae0d8 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -5,9 +5,11 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import os +import inspect from typing import List import torch +from typing import Any, Callable, Dict, List, Optional, Union from shark_turbine.aot import * import shark_turbine.ops.iree as ops from iree.compiler.ir import Context @@ -75,11 +77,12 @@ def initialize(self, sample): def prepare_model_input(self, sample, t, timesteps): t = timesteps[t] - t = t.expand(sample.shape[0]) + if self.do_classifier_free_guidance: latent_model_input = torch.cat([sample] * 2) else: latent_model_input = sample + t = t.expand(sample.shape[0]) return latent_model_input.type(self.dtype), t.type(self.dtype) def step(self, noise_pred, t, sample, guidance_scale, i): @@ -146,6 +149,42 @@ def step(self, noise_pred, t, latents, guidance_scale, i): return_dict=False, )[0] +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +# Only used for cpu scheduling. +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps @torch.no_grad() def export_scheduler_model( diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py index a70c19882..5bd6f0f5b 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py @@ -33,6 +33,7 @@ def __init__( ) def decode(self, inp): + inp = (inp / self.vae.config.scaling_factor) + self.vae.config.shift_factor image = self.vae.decode(inp, return_dict=False)[0] image = image.float() image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py index 31b23b429..9cb435bde 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py @@ -45,12 +45,17 @@ def imagearray_from_vae_out(image): if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args import numpy as np + from PIL import Image dtype = torch.float16 if args.precision == "fp16" else torch.float32 if args.vae_variant == "decode": example_input = torch.rand( args.batch_size, 16, args.height // 8, args.width // 8, dtype=dtype ) + if args.vae_input_path: + example_input = np.load(args.vae_input_path) + if example_input.shape[0] == 2: + example_input = np.split(example_input, 2)[0] elif args.vae_variant == "encode": example_input = torch.rand( args.batch_size, 3, args.height, args.width, dtype=dtype @@ -74,13 +79,16 @@ def imagearray_from_vae_out(image): from turbine_models.custom_models.sd_inference import utils torch_output = run_torch_vae( - args.hf_model_name, args.vae_variant, example_input.float() + args.hf_model_name, args.vae_variant, torch.tensor(example_input).float() ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) + if args.vae_input_path: + out_image_torch = Image.fromarray(torch_output) + out_image_torch.save("vae_test_output_torch.png") + out_image_turbine = Image.fromarray(turbine_results) + out_image_turbine.save("vae_test_output_turbine.png") # Allow a small amount of wiggle room for rounding errors (1) + np.testing.assert_allclose( turbine_results, torch_output, rtol=1, atol=1 ) - - # TODO: Figure out why we occasionally segfault without unlinking output variables - turbine_results = None From 618d01f9b725d1c60e0c6da4302bf0976792ea3c Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Tue, 18 Jun 2024 21:12:41 -0500 Subject: [PATCH 134/174] Point to azure links for specs and fix timesteps dim in gpu scheduler. 
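The scheduler part of this change fixes the timestep batch dimension: with classifier-free guidance the latent batch is doubled before the MMDiT call, so the timestep has to be expanded to the doubled batch rather than to the original sample. A minimal standalone sketch of the intended behavior (names approximate; the real logic lives in the scheduler wrapper's prepare_model_input):

import torch

def prepare_model_input(sample, t, do_classifier_free_guidance, dtype=torch.float16):
    # The latent batch is doubled for CFG first...
    if do_classifier_free_guidance:
        latent_model_input = torch.cat([sample] * 2)
    else:
        latent_model_input = sample
    # ...so the timestep must be expanded to match latent_model_input, not sample.
    t = t.expand(latent_model_input.shape[0])
    return latent_model_input.type(dtype), t.type(dtype)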
--- .../custom_models/sd3_inference/sd3_schedulers.py | 2 +- models/turbine_models/custom_models/sd_inference/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 0fe4ae0d8..2efb13aa9 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -82,7 +82,7 @@ def prepare_model_input(self, sample, t, timesteps): latent_model_input = torch.cat([sample] * 2) else: latent_model_input = sample - t = t.expand(sample.shape[0]) + t = t.expand(latent_model_input.shape[0]) return latent_model_input.type(self.dtype), t.type(self.dtype) def step(self, noise_pred, t, sample, guidance_scale, i): diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index e4b755131..0931a4028 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -35,7 +35,7 @@ "--iree-codegen-gpu-native-math-precision=true", "--iree-rocm-waves-per-eu=2", "--iree-flow-inline-constants-max-byte-length=1", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics, iree-linalg-ext-pad-attention{pad-to-multiple-of=0,128,0,32,0}))", ], "unet": [ "--iree-flow-enable-aggressive-fusion", @@ -275,7 +275,7 @@ def create_safe_name(hf_model_name, model_name_str): def get_mfma_spec_path(target_chip, save_dir): - url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir" + url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx942.mlir" attn_spec = urlopen(url).read().decode("utf-8") spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir") if os.path.exists(spec_path): @@ -287,9 +287,9 @@ def get_mfma_spec_path(target_chip, save_dir): def get_wmma_spec_path(target_chip, save_dir): if target_chip == "gfx1100": - url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1100.spec.mlir" + url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx1100.mlir" elif target_chip in ["gfx1103", "gfx1150"]: - url = "https://github.com/iree-org/iree/raw/shared/tresleches-united/scripts/attention_gfx1103.spec.mlir" + url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx1150.mlir" else: return None attn_spec = urlopen(url).read().decode("utf-8") From 92be65b3125c058e4130bf36ff1b809fef39cc87 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Wed, 19 Jun 2024 04:23:38 -0500 Subject: [PATCH 135/174] Fixes to filename handling and model loading. 
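The filename handling changes below move artifact discovery away from fixed names (e.g. unet.vmfb) to descriptive names that encode batch size, resolution, precision and target triple, and is_prepared now locates compiled files by keyword match instead of exact path. A rough sketch of that lookup, assuming the pipeline directory layout used by these pipelines (the helper name here is illustrative, not the actual API):

import os

def find_vmfb(pipeline_dir, keywords):
    # Treat a submodel as prepared if some file in the pipeline directory
    # contains every keyword (submodel name, precision, sizes, target, "vmfb").
    for filename in os.listdir(pipeline_dir):
        if all(str(k) in filename for k in keywords):
            return os.path.join(pipeline_dir, filename)
    return None

# Example use: find_vmfb("./shark_vmfbs", ["unet", "fp16", 1024, 1024, "gfx942", "vmfb"])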
--- .../custom_models/sd3_inference/sd3_mmdit.py | 13 +- .../sd3_inference/sd3_pipeline.py | 117 +++++++++------ .../sd3_inference/sd3_schedulers.py | 31 ++-- .../sd3_inference/sd3_text_encoders.py | 17 ++- .../custom_models/sd3_inference/sd3_vae.py | 16 +- .../custom_models/sd_inference/schedulers.py | 34 ++--- .../sdxl_inference/sdxl_compiled_pipeline.py | 140 ++++++++++-------- .../sdxl_inference/sdxl_prompt_encoder.py | 15 +- .../sdxl_inference/sdxl_scheduled_unet.py | 16 +- .../custom_models/sdxl_inference/unet.py | 14 +- .../custom_models/sdxl_inference/vae.py | 22 +-- 11 files changed, 238 insertions(+), 197 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 05d3e00cb..8f4ac25be 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -160,13 +160,12 @@ def export_mmdit_model( weights_only=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_mmdit", + ) if pipeline_dir: - safe_name = os.path.join(pipeline_dir, f"mmdit") - else: - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_mmdit", - ) + safe_name = os.path.join(pipeline_dir, safe_name) if decomp_attn == True: ireec_flags += ",--iree-opt-aggressively-propagate-transposes=False" @@ -250,7 +249,7 @@ class CompiledMmdit(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=True, attn_spec=attn_spec, ) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 303ba326e..15b53a96c 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -6,6 +6,7 @@ import logging import torch +from tqdm.auto import tqdm from turbine_models.custom_models.sd3_inference import ( sd3_text_encoders, sd3_mmdit, @@ -74,7 +75,7 @@ def __init__( self.precision = precision self.max_length = max_length self.batch_size = batch_size - self.num_inference_steps = num_inference_steps + self.num_inference_steps = None self.devices = {} if isinstance(device, dict): assert isinstance( @@ -129,7 +130,7 @@ def __init__( self.vae_dtype = torch.float32 if vae_precision == "fp32" else torch.float16 # TODO: set this based on user-inputted guidance scale and negative prompt. 
self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True - + self._interrupt = False # FILE MANAGEMENT AND PIPELINE SETUP def check_prepared( @@ -152,7 +153,7 @@ def check_prepared( if do_continue.lower() == "y": for submodel in vmfbs.keys(): if vmfbs[submodel] == None: - print(submodel) + print("Fetching: ", submodel) vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) vmfbs[submodel] = vmfb if weights[submodel] is None: @@ -175,28 +176,28 @@ def check_prepared( def is_prepared(self, vmfbs, weights): missing = [] + height = self.height + width = self.width for key in vmfbs: - if key == "scheduler" and not self.cpu_scheduling: - val = f"EulerFlowScheduler_{self.num_inference_steps}" - default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") - elif key == "scheduler": - val = None - default_filepath = None + if key == "scheduler": continue + elif key == "vae": + keywords = ["vae", self.vae_precision, height, width] + device_key = "vae" elif key == "clip": - val = "text_encoders" - default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") - else: - val = vmfbs[key] - default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") - if vmfbs[key] is not None and os.path.exists(vmfbs[key]): - continue - elif vmfbs[key] == None and os.path.exists(default_filepath): - vmfbs[key] = default_filepath - elif val is None: - missing.append(key + ".vmfb") + keywords = ["text_encoders", self.precision, self.max_length] + device_key = "clip" else: - missing.append(val + ".vmfb") + keywords = [key, self.precision, self.max_length, height, width] + device_key = key + avail_files = os.listdir(self.pipeline_dir) + keywords.append("vmfb") + keywords.append(self.devices[device_key]["target"]) + for filename in avail_files: + if all(str(x) in filename for x in keywords): + vmfbs[key] = os.path.join(self.pipeline_dir, filename) + if not vmfbs[key]: + missing.append(key + " vmfb") for w_key in weights: if any(x in w_key for x in ["pipeline", "scheduler"]): continue @@ -267,7 +268,7 @@ def export_submodel( if not os.path.exists(self.external_weights_dir): os.makedirs(self.external_weights_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.external_weights_dir, "vae." + self.external_weights + self.external_weights_dir, f"sd3_vae_{self.vae_precision}." + self.external_weights ) mmdit_external_weight_path = os.path.join( self.external_weights_dir, @@ -290,7 +291,7 @@ def export_submodel( if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.pipeline_dir, "vae." + self.external_weights + self.pipeline_dir, f"sd3_vae_{self.vae_precision}." 
+ self.external_weights ) mmdit_external_weight_path = os.path.join( self.pipeline_dir, @@ -351,7 +352,7 @@ def export_submodel( self.ireec_flags["scheduler"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, - input_mlir=input_mlir["scheduler"], + input_mlir=None, ) return scheduler_vmfb, None case "vae": @@ -427,19 +428,19 @@ def load_pipeline( unet_loaded = time.time() print("\n[LOG] MMDiT loaded in ", unet_loaded - load_start, "sec") - if not self.cpu_scheduling: - runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( - self.devices["mmdit"]["driver"], - vmfbs["scheduler"], - ) - else: - print("Using torch CPU scheduler.") - runners["scheduler"] = FlowMatchEulerDiscreteScheduler.from_pretrained( - self.hf_model_name, subfolder="scheduler" - ) - - sched_loaded = time.time() - print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") + # if not self.cpu_scheduling: + # runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( + # self.devices["mmdit"]["driver"], + # vmfbs["scheduler"], + # ) + # else: + # print("Using torch CPU scheduler.") + # runners["scheduler"] = FlowMatchEulerDiscreteScheduler.from_pretrained( + # self.hf_model_name, subfolder="scheduler" + # ) + + # sched_loaded = time.time() + # print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") runners["vae"] = vmfbRunner( self.devices["vae"]["driver"], vmfbs["vae"], @@ -447,7 +448,7 @@ def load_pipeline( extra_plugin=delegate, ) vae_loaded = time.time() - print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") + print("\n[LOG] VAE Decode loaded in ", vae_loaded - unet_loaded, "sec") runners["clip"] = vmfbRunner( self.devices["clip"]["driver"], vmfbs["clip"], @@ -474,7 +475,34 @@ def generate_images( guidance_scale: float = 4, seed: float = -1, return_imgs: bool = False, + steps: int = None, + cpu_scheduling: bool = False, + scheduler_id: str = None, + progress=None, ): + needs_new_scheduler = (steps and steps != self.num_inference_steps) or cpu_scheduling != self.cpu_scheduling + self.cpu_scheduling = cpu_scheduling + if steps: + self.num_inference_steps = steps + if steps and not self.cpu_scheduling and needs_new_scheduler: + self.runners["scheduler"] = None + self.num_inference_steps = steps + scheduler_path = f"EulerFlowScheduler_{self.num_inference_steps}" + if not os.path.exists(scheduler_path): + scheduler_path, _ = self.export_submodel("scheduler") + try: + self.runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( + self.devices["mmdit"]["driver"], + scheduler_path, + ) + except: + print("JIT export of scheduler failed. Loading CPU scheduler.") + self.cpu_scheduling = True + if self.cpu_scheduling and needs_new_scheduler: + self.runners["scheduler"] = FlowMatchEulerDiscreteScheduler.from_pretrained( + self.hf_model_name, subfolder="scheduler" + ) + # TODO: implement case where this is false e.g. 
in SDXL Turbo do_classifier_free_guidance = True @@ -551,13 +579,16 @@ def generate_images( if self.cpu_scheduling: timesteps, num_inference_steps = sd3_schedulers.retrieve_timesteps( self.runners["scheduler"], - num_inference_steps=self.num_inference_steps, + num_inference_steps=steps, timesteps=None, ) steps = num_inference_steps for i in range(batch_count): + if self._interrupt: + self._interrupt = False + return unet_start = time.time() if not self.cpu_scheduling: latents, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) @@ -575,7 +606,10 @@ def generate_images( ), None, ] - for s in range(steps): + for s in tqdm(iterable=range(steps), desc=f"Inference steps ({steps}), batch {i+1}"): + if self._interrupt: + self._interrupt = False + return # print(f"step {s}") if self.cpu_scheduling: step_index = s @@ -634,7 +668,6 @@ def generate_images( latents, return_dict=False, )[0] - if isinstance(latents, torch.Tensor): latents = latents.type(self.vae_dtype) latents = ireert.asdevicearray( @@ -691,7 +724,7 @@ def generate_images( out_image = Image.fromarray(image) images.extend([[out_image]]) if return_imgs: - return images[0] + return images for idx_batch, image_batch in enumerate(images): for idx, image in enumerate(image_batch): img_path = ( diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 2efb13aa9..7c6acbd03 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -206,31 +206,24 @@ def export_scheduler_model( ): dtype = torch.float16 if precision == "fp16" else torch.float32 scheduler_module = FlowSchedulingModel(hf_model_name, num_inference_steps, dtype) + vmfb_names = [ + "EulerFlowScheduler", + f"bs{args.batch_size}_{args.height}x{args.width}", + precision, + str(num_inference_steps), + target_triple, + ] + vmfb_name = "_".join(vmfb_names) + safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) if pipeline_dir: - vmfb_names = [ - "EulerFlowScheduler", - str(num_inference_steps), - ] - vmfb_name = "_".join(vmfb_names) - safe_name = os.path.join(pipeline_dir, vmfb_name) - else: - vmfb_names = [ - "EulerFlowScheduler", - f"bs{args.batch_size}_{args.height}x{args.width}", - precision, - str(num_inference_steps), - target_triple, - ] - vmfb_name = "_".join(vmfb_names) - safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) - + safe_name = os.path.join(pipeline_dir, safe_name) if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, ) @@ -326,7 +319,7 @@ class CompiledScheduler(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=True, ) if exit_on_vmfb: diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index 97d41caf4..bebbee499 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -129,19 +129,20 @@ def export_text_encoders( output_batchsize=1, decomp_attn=True, ): - if pipeline_dir not in [None, ""]: - safe_name = os.path.join(pipeline_dir, "text_encoders") - else: - safe_name = utils.create_safe_name( - hf_model_name, 
f"_{str(max_length)}_{precision}_text_encoders-{device}" - ) + + safe_name = utils.create_safe_name( + hf_model_name, f"_bs{output_batchsize}_{str(max_length)}_{precision}_text_encoders-{device}" + ) + if pipeline_dir: + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, const_expr_hoisting=True, @@ -200,7 +201,7 @@ class CompiledTextEncoder(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=not exit_on_vmfb, const_expr_hoisting=True, attn_spec=attn_spec, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py index 5bd6f0f5b..e6578bb08 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py @@ -71,20 +71,20 @@ def export_vae_model( weights_only=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{height}x{width}_{precision}_vae_{device}", + ) if pipeline_dir: - safe_name = os.path.join(pipeline_dir, "vae") - else: - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{height}x{width}_{precision}_vae_{device}", - ) + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, @@ -156,7 +156,7 @@ class CompiledVae(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=not exit_on_vmfb, attn_spec=attn_spec, ) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index bb26e95d1..2b258a950 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -214,24 +214,20 @@ def export_scheduler_model( scheduler_module = SchedulingModel( hf_model_name, scheduler, height, width, batch_size, num_inference_steps, dtype ) + + vmfb_names = [ + scheduler_id + "Scheduler", + f"bs{batch_size}", + f"{height}x{width}", + precision, + str(num_inference_steps), + target_triple, + ] + vmfb_name = "_".join(vmfb_names) + safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) if pipeline_dir: - vmfb_names = [ - scheduler_id + "Scheduler", - str(num_inference_steps), - ] - vmfb_name = "_".join(vmfb_names) - safe_name = os.path.join(pipeline_dir, vmfb_name) - else: - vmfb_names = [ - scheduler_id + "Scheduler", - f"bs{batch_size}", - f"{height}x{width}", - precision, - str(num_inference_steps), - target_triple, - ] - vmfb_name = "_".join(vmfb_names) - safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( @@ -239,7 +235,7 @@ def export_scheduler_model( device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, ) @@ -335,7 +331,7 @@ class CompiledScheduler(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + 
target_triple, return_path=True, ) if exit_on_vmfb: diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 550e42679..61679c604 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -48,7 +48,6 @@ class SharkSDXLPipeline: def __init__( self, hf_model_name: str, - scheduler_id: str, height: int, width: int, precision: str, @@ -57,16 +56,17 @@ def __init__( num_inference_steps: int, device: str | dict[str], iree_target_triple: str | dict[str], + scheduler_id: str = "EulerDiscrete", ireec_flags: dict = EMPTY_FLAGS, attn_spec: str = None, decomp_attn: bool = False, pipeline_dir: str = "./shark_vmfbs", external_weights_dir: str = "./shark_weights", external_weights: str = "safetensors", - vae_decomp_attn: bool = True, + vae_decomp_attn: bool = False, custom_vae: str = "", cpu_scheduling: bool = False, - vae_precision: str = "fp16", + vae_precision: str = "fp32", ): self.hf_model_name = hf_model_name self.scheduler_id = scheduler_id @@ -100,21 +100,13 @@ def __init__( assert isinstance( iree_target_triple, str ), "Device and target triple must be both dicts or both strings." - self.devices["clip"] = { - "device": device, - "driver": utils.iree_device_map(device), - "target": iree_target_triple, - } self.devices["unet"] = { "device": device, "driver": utils.iree_device_map(device), "target": iree_target_triple, } - self.devices["vae"] = { - "device": device, - "driver": utils.iree_device_map(device), - "target": iree_target_triple, - } + self.devices["clip"] = self.devices["unet"] + self.devices["vae"] = self.devices["unet"] self.ireec_flags = ireec_flags if ireec_flags else EMPTY_FLAGS self.attn_spec = attn_spec self.decomp_attn = decomp_attn @@ -151,7 +143,7 @@ def check_prepared( if do_continue.lower() == "y": for submodel in vmfbs.keys(): if vmfbs[submodel] == None: - print(submodel) + print("Fetching: ", submodel) vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) vmfbs[submodel] = vmfb if weights[submodel] is None: @@ -174,28 +166,31 @@ def check_prepared( def is_prepared(self, vmfbs, weights): missing = [] + height = self.height + width = self.width for key in vmfbs: if key == "scheduled_unet": - val = f"{self.scheduler_id}_unet_{self.num_inference_steps}" - default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") - elif key == "scheduler" and not self.cpu_scheduling: - val = f"{self.scheduler_id}Scheduler_{self.num_inference_steps}" - default_filepath = os.path.join(self.pipeline_dir, val + ".vmfb") + keywords = ["unet", self.scheduler_id, self.num_inference_steps, self.precision, height, width] + device_key = "unet" elif key == "scheduler": - val = None - default_filepath = None continue + elif key == "vae_decode": + keywords = ["vae", self.vae_precision, height, width] + device_key = "vae" + elif key == "prompt_encoder": + keywords = ["prompt_encoder", self.precision, self.max_length] + device_key = "clip" else: - val = vmfbs[key] - default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") - if vmfbs[key] is not None and os.path.exists(vmfbs[key]): - continue - elif vmfbs[key] == None and os.path.exists(default_filepath): - vmfbs[key] = default_filepath - elif val is None: - missing.append(key + ".vmfb") - else: - missing.append(val + ".vmfb") + keywords = [key, self.precision, self.max_length, height, width] + 
device_key = key + avail_files = os.listdir(self.pipeline_dir) + keywords.append("vmfb") + keywords.append(self.devices[device_key]["target"]) + for filename in avail_files: + if all(str(x) in filename for x in keywords): + vmfbs[key] = os.path.join(self.pipeline_dir, filename) + if not vmfbs[key]: + missing.append(key + " vmfb") for w_key in weights: if any(x in w_key for x in ["pipeline", "scheduler"]): continue @@ -258,7 +253,7 @@ def get_torch_models(self, submodel): self.hf_model_name, custom_vae=( "madebyollin/sdxl-vae-fp16-fix" - if self.precision == "fp16" + if self.vae_precision == "fp16" else self.custom_vae ), ) @@ -283,13 +278,13 @@ def export_submodel( if not os.path.exists(self.external_weights_dir): os.makedirs(self.external_weights_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.external_weights_dir, "vae_decode." + self.external_weights + self.external_weights_dir, f"vae_decode_{self.vae_precision}." + self.external_weights ) unet_external_weight_path = os.path.join( - self.external_weights_dir, "unet." + self.external_weights + self.external_weights_dir, f"unet_{self.precision}." + self.external_weights ) prompt_encoder_external_weight_path = os.path.join( - self.external_weights_dir, "prompt_encoder." + self.external_weights + self.external_weights_dir, f"prompt_encoder_{self.precision}." + self.external_weights ) elif self.external_weights is None: print( @@ -302,17 +297,16 @@ def export_submodel( print( f"No external weights directory specified using --external_weights_dir, we assume you have your own weights in {self.pipeline_dir}." ) - external_weights_dir = self.pipeline_dir if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.pipeline_dir, "vae_decode." + self.external_weights + self.pipeline_dir, f"vae_decode_{self.vae_precision}." + self.external_weights ) unet_external_weight_path = os.path.join( - self.pipeline_dir, "unet." + self.external_weights + self.pipeline_dir, f"unet_{self.precision}." + self.external_weights ) prompt_encoder_external_weight_path = os.path.join( - self.pipeline_dir, "prompt_encoder." + self.external_weights + self.pipeline_dir, f"prompt_encoder_{self.precision}." + self.external_weights ) if weights_only: input_mlir = { @@ -402,7 +396,7 @@ def export_submodel( self.ireec_flags["scheduler"], exit_on_vmfb=False, pipeline_dir=self.pipeline_dir, - input_mlir=input_mlir["scheduler"], + input_mlir=None, ) return scheduler_vmfb, None case "vae_decode": @@ -510,6 +504,9 @@ def load_pipeline( runners = {} load_start = time.time() if split_scheduler: + # We get scheduler at generate time and set steps then. + self.num_inference_steps = None + self.split_scheduler = True runners["pipe"] = vmfbRunner( self.devices["unet"]["driver"], vmfbs["unet"], @@ -517,25 +514,6 @@ def load_pipeline( ) unet_loaded = time.time() print("\n[LOG] Unet loaded in ", unet_loaded - load_start, "sec") - if not self.cpu_scheduling: - runners["scheduler"] = schedulers.SharkSchedulerWrapper( - self.devices["unet"]["driver"], - vmfbs["scheduler"], - ) - else: - print("\n[LOG] Running scheduler on CPU. 
This will affect performance.") - scheduler = schedulers.get_scheduler( - args.hf_model_name, args.scheduler_id - ) - runners["scheduler"] = schedulers.SharkSchedulerCPUWrapper( - scheduler, - args.batch_size, - args.num_inference_steps, - runners["pipe"].config.device, - latents_dtype="float32" if args.precision == "fp32" else "float16", - ) - sched_loaded = time.time() - print("\n[LOG] Scheduler loaded in ", sched_loaded - unet_loaded, "sec") runners["vae_decode"] = vmfbRunner( self.devices["vae"]["driver"], vmfbs["vae_decode"], @@ -543,7 +521,7 @@ def load_pipeline( extra_plugin=delegate, ) vae_loaded = time.time() - print("\n[LOG] VAE Decode loaded in ", vae_loaded - sched_loaded, "sec") + print("\n[LOG] VAE Decode loaded in ", vae_loaded - unet_loaded, "sec") runners["prompt_encoder"] = vmfbRunner( self.devices["clip"]["driver"], vmfbs["prompt_encoder"], @@ -627,7 +605,42 @@ def generate_images( guidance_scale: float = 7.5, seed: float = -1, return_imgs: bool = False, + steps: int = None, + cpu_scheduling: bool = False, + scheduler_id: str = "EulerDiscrete", + progress=None, ): + needs_new_scheduler = (steps and steps != self.num_inference_steps) or cpu_scheduling != self.cpu_scheduling + self.cpu_scheduling = cpu_scheduling + if steps and not self.compiled_pipeline and needs_new_scheduler: + self.num_inference_steps = steps + if steps and not self.cpu_scheduling and not self.compiled_pipeline and needs_new_scheduler: + self.runners["scheduler"] = None + self.num_inference_steps = steps + self.scheduler_id = scheduler_id + scheduler_path = f"{scheduler_id}Scheduler_{self.num_inference_steps}" + if not os.path.exists(scheduler_path): + scheduler_path, _ = self.export_submodel("scheduler") + try: + self.runners["scheduler"] = schedulers.SharkSchedulerWrapper( + self.devices["unet"]["driver"], + scheduler_path, + ) + except: + print("JIT export of scheduler failed. Loading CPU scheduler.") + self.cpu_scheduling = True + if self.cpu_scheduling and needs_new_scheduler: + scheduler = schedulers.get_scheduler( + self.hf_model_name, scheduler_id + ) + self.runners["scheduler"] = schedulers.SharkSchedulerCPUWrapper( + scheduler, + self.batch_size, + self.num_inference_steps, + self.runners["pipe"].config.device, + latents_dtype="float32" if self.precision == "fp32" else "float16", + ) + # TODO: implement case where this is false e.g. 
in SDXL Turbo do_classifier_free_guidance = True @@ -748,7 +761,7 @@ def generate_images( for i in range(batch_count): unet_start = time.time() - if self.runners["scheduler"]: + if self.split_scheduler: sample, time_ids, steps, timesteps = self.runners[ "scheduler" ].initialize(samples[i]) @@ -982,7 +995,6 @@ def numpy_to_pil_image(images): args.precision, args.max_length, args.batch_size, - args.num_inference_steps, devices, targets, ireec_flags, @@ -993,7 +1005,6 @@ def numpy_to_pil_image(images): args.external_weights, args.vae_decomp_attn, custom_vae=None, - cpu_scheduling=args.cpu_scheduling, vae_precision=args.vae_precision, ) vmfbs, weights = sdxl_pipe.check_prepared(mlirs, vmfbs, weights) @@ -1017,5 +1028,8 @@ def numpy_to_pil_image(images): args.guidance_scale, args.seed, False, + args.num_inference_steps, + cpu_scheduling=args.cpu_scheduling, + scheduler_id=args.scheduler_id, ) print("Image generation complete.") diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 3df5607fc..f4174ca2a 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -166,19 +166,20 @@ def export_prompt_encoder( do_classifier_free_guidance = False else: do_classifier_free_guidance = True + + safe_name = utils.create_safe_name( + hf_model_name, f"_bs{output_batchsize}_{str(max_length)}-{precision}-prompt-encoder-{device}" + ) if pipeline_dir not in [None, ""]: - safe_name = os.path.join(pipeline_dir, "prompt_encoder") - else: - safe_name = utils.create_safe_name( - hf_model_name, f"{str(max_length)}-{precision}-prompt-encoder-{device}" - ) + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, const_expr_hoisting=True, @@ -259,7 +260,7 @@ def encode_prompts_turbo( device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=not exit_on_vmfb, const_expr_hoisting=True, attn_spec=attn_spec, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 21597d457..6ec6d11a5 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -171,14 +171,14 @@ def export_scheduled_unet_model( # else: # do_classifier_free_guidance = True do_classifier_free_guidance = True + + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_scheduled_unet_{str(num_inference_steps)}", + ) if pipeline_dir: safe_name = os.path.join( - pipeline_dir, f"{scheduler_id}_unet_{str(num_inference_steps)}" - ) - else: - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_scheduled_unet_{device}", + pipeline_dir, safe_name ) if input_mlir: @@ -187,7 +187,7 @@ def export_scheduled_unet_model( device, iree_target_triple, ireec_flags, - safe_name, + safe_name + "_" + iree_target_triple, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, @@ -280,7 +280,7 @@ class CompiledScheduledUnet(CompiledModule): device, iree_target_triple, ireec_flags, - 
safe_name, + safe_name + "_" + iree_target_triple, return_path=True, attn_spec=attn_spec, ) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 701909ae5..6b45ab799 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -93,13 +93,13 @@ def export_unet_model( input_mlir=None, weights_only=False, ): + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet", + ) if pipeline_dir: - safe_name = os.path.join(pipeline_dir, f"unet") - else: - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet", - ) + safe_name = os.path.join(pipeline_dir, safe_name) + if decomp_attn == True: ireec_flags += ",--iree-opt-aggressively-propagate-transposes=False" @@ -190,7 +190,7 @@ class CompiledUnet(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=True, attn_spec=attn_spec, ) diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index b5753b346..ed474256e 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -85,26 +85,29 @@ def export_vae_model( input_mlir=None, weights_only=False, ): + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}", + ) if pipeline_dir: - safe_name = os.path.join(pipeline_dir, "vae_" + variant) - else: - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}", - ) + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, ) return vmfb_path - + if precision == "fp32" and device == "rocm": + decomp_attn = True + external_weights = None + print("Decomposing attention and inlining weights for fp32 VAE on ROCm") if device == "cpu": decomp_attn = True @@ -136,6 +139,7 @@ def export_vae_model( ] decomp_list = [] if decomp_attn == True: + safe_name += "_decomp" decomp_list = [ torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, torch.ops.aten._scaled_dot_product_flash_attention.default, @@ -173,7 +177,7 @@ class CompiledVae(CompiledModule): device, target_triple, ireec_flags, - safe_name, + safe_name + "_" + target_triple, return_path=not exit_on_vmfb, attn_spec=attn_spec, ) From fd1543b7e070a78c4efaf15c0ad92e01c95bc7fa Mon Sep 17 00:00:00 2001 From: dan Date: Wed, 19 Jun 2024 14:27:54 -0500 Subject: [PATCH 136/174] add clip test poc --- .../sd3_inference/sd3_cmd_opts.py | 2 +- .../custom_models/sd3_inference/sd3_mmdit.py | 2 +- models/turbine_models/tests/sd3_test.py | 557 ++++++++++++++++++ 3 files changed, 559 insertions(+), 2 deletions(-) create mode 100644 models/turbine_models/tests/sd3_test.py diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py index 55cf3b72d..78acb4e5f 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py @@ -406,7 
+406,7 @@ def is_valid_file(arg): ) p.add_argument( - "--unet_flags", + "--mmdit_flags", type=str, default="", help="extra iree-compile options to send for compiling unet. Only use this for testing bleeding edge flags! Any default options should be added to sd_inference/utils.py", diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 8f4ac25be..8b3176c8d 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -301,7 +301,7 @@ class CompiledMmdit(CompiledModule): args.external_weight_path, args.device, args.iree_target_triple, - args.ireec_flags + args.attn_flags + args.unet_flags, + args.ireec_flags + args.attn_flags + args.mmdit_flags, args.decomp_attn, attn_spec=args.attn_spec, input_mlir=args.input_mlir, diff --git a/models/turbine_models/tests/sd3_test.py b/models/turbine_models/tests/sd3_test.py new file mode 100644 index 000000000..a495ebf5b --- /dev/null +++ b/models/turbine_models/tests/sd3_test.py @@ -0,0 +1,557 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import logging +import pytest +import torch +from transformers import CLIPTokenizer +from turbine_models.custom_models.sd_inference.utils import create_safe_name +from turbine_models.custom_models.sd3_inference.text_encoder_impls import SD3Tokenizer +from turbine_models.custom_models.sd3_inference import ( + sd3_text_encoders, + sd3_text_encoders_runner, + sd3_mmdit, + sd3_mmdit_runner, + sd3_vae, + sd3_vae_runner, + sd3_pipeline, + sd3_schedulers, +) +from turbine_models.custom_models.sd_inference import utils +from turbine_models.custom_models.sd3_inference.sd3_text_encoders import ( + TextEncoderModule, +) +from turbine_models.utils.sdxl_benchmark import run_benchmark +import unittest +from tqdm.auto import tqdm +from PIL import Image +import os +import numpy as np +import time + + +torch.random.manual_seed(0) + +arguments = {} + + +@pytest.fixture(scope="session") +def command_line_args(request): + arguments["hf_auth_token"] = request.config.getoption("--hf_auth_token") + arguments["hf_model_name"] = request.config.getoption("--hf_model_name") + arguments["scheduler_id"] = request.config.getoption("--scheduler_id") + arguments["model_path"] = request.config.getoption("--model_path") + arguments["vae_model_path"] = request.config.getoption("--vae_model_path") + arguments["prompt"] = request.config.getoption("--prompt") + arguments["negative_prompt"] = request.config.getoption("--negative_prompt") + arguments["num_inference_steps"] = int( + request.config.getoption("--num_inference_steps") + ) + arguments["guidance_scale"] = float(request.config.getoption("--guidance_scale")) + arguments["seed"] = float(request.config.getoption("--seed")) + arguments["denoise"] = request.config.getoption("--denoise") + arguments["external_weight_path"] = request.config.getoption( + "--external_weight_path" + ) + arguments["external_weight_dir"] = request.config.getoption("--external_weight_dir") + arguments["external_weight_file"] = request.config.getoption("--external_weight_file") + arguments["vmfb_path"] = request.config.getoption("--vmfb_path") + arguments["pipeline_vmfb_path"] = request.config.getoption("--pipeline_vmfb_path") + arguments["scheduler_vmfb_path"] = 
request.config.getoption("--scheduler_vmfb_path") + arguments["split_scheduler"] = request.config.getoption("--split_scheduler") + arguments["cpu_scheduling"] = request.config.getoption("--cpu_scheduling") + arguments["pipeline_dir"] = request.config.getoption("--pipeline_dir") + arguments["compiled_pipeline"] = request.config.getoption("--compiled_pipeline") + arguments["npu_delegate_path"] = request.config.getoption("--npu_delegate_path") + arguments["clip_device"] = request.config.getoption("--clip_device") + arguments["mmdit_device"] = request.config.getoption("--mmdit_device") + arguments["vae_device"] = request.config.getoption("--vae_device") + arguments["clip_target"] = request.config.getoption("--clip_target") + arguments["vae_target"] = request.config.getoption("--vae_target") + arguments["mmdit_target"] = request.config.getoption("--mmdit_target") + arguments["batch_size"] = int(request.config.getoption("--batch_size")) + arguments["height"] = int(request.config.getoption("--height")) + arguments["width"] = int(request.config.getoption("--width")) + arguments["precision"] = request.config.getoption("--precision") + arguments["vae_precision"] = request.config.getoption("--vae_precision") + arguments["max_length"] = int(request.config.getoption("--max_length")) + arguments["vae_variant"] = request.config.getoption("--vae_variant") + arguments["shift"] = request.config.getoption("--shift") + arguments["vae_decomp_attn"] = request.config.getoption("--vae_decomp_attn") + arguments["vae_dtype"] = request.config.getoption("--vae_dtype") + arguments["external_weights"] = request.config.getoption("--external_weights") + arguments["decomp_attn"] = request.config.getoption("--decomp_attn") + arguments["exit_on_vmfb"] = request.config.getoption("--exit_on_vmfb") + arguments["output"] = request.config.getoption("--output") + arguments["attn_spec"] = request.config.getoption("--attn_spec") + arguments["device"] = request.config.getoption("--device") + arguments["rt_device"] = request.config.getoption("--rt_device") + arguments["iree_target_triple"] = request.config.getoption("--iree_target_triple") + arguments["ireec_flags"] = request.config.getoption("--ireec_flags") + arguments["attn_flags"] = request.config.getoption("--attn_flags") + arguments["clip_flags"] = request.config.getoption("--clip_flags") + arguments["vae_flags"] = request.config.getoption("--vae_flags") + arguments["mmdit_flags"] = request.config.getoption("--mmdit_flags") + +@pytest.mark.usefixtures("command_line_args") +class StableDiffusion3Test(unittest.TestCase): + def setUp(self): + self.safe_model_name = create_safe_name(arguments["hf_model_name"], "") + self.mmdit_model = sd3_mmdit.MMDiTModel( + arguments["hf_model_name"], + precision=arguments["precision"], + ) + self.vae_model = sd3_vae.VaeModel( + # This is a public model, so no auth required + arguments["hf_model_name"], + custom_vae=( + "madebyollin/sdxl-vae-fp16-fix" + if arguments["precision"] == "fp16" + else None + ), + ) + + def test01_ExportPromptEncoder(self): + if arguments["device"] in ["vulkan", "cuda"]: + self.skipTest( + "Not testing sd3 on vk or cuda" + ) + arguments["external_weight_path"] = ( + arguments["external_weight_path"] + "/sd3_text_encoders_"+arguments["precision"]+ ".irpa" + ) + _, prompt_encoder_vmfb = sd3_text_encoders.export_text_encoders( + arguments["hf_model_name"], + hf_auth_token=None, + max_length=arguments["max_length"], + precision=arguments["precision"], + compile_to="vmfb", + external_weights=arguments["external_weights"], + 
external_weight_path=arguments["external_weight_path"], + device=arguments["device"], + target_triple=arguments["clip_target"], + ireec_flags=arguments["ireec_flags"], + exit_on_vmfb=True, + pipeline_dir=arguments["pipeline_dir"], + input_mlir=None, + attn_spec=arguments["attn_spec"], + output_batchsize=arguments["batch_size"], + decomp_attn=arguments["decomp_attn"], + ) + tokenizer = SD3Tokenizer() + ( + text_input_ids_list, + uncond_input_ids_list, + ) = sd3_text_encoders_runner.run_tokenize( + tokenizer, + arguments["prompt"], + arguments["negative_prompt"], + ) + ( + turbine_output1, + turbine_output2, + ) = sd3_text_encoders_runner.run_prompt_encoder( + prompt_encoder_vmfb, + arguments["rt_device"], + arguments["external_weight_path"], + text_input_ids_list, + uncond_input_ids_list, + ) + torch_encoder_model = TextEncoderModule( + arguments["batch_size"], + ) + torch_output1, torch_output2 = torch_encoder_model.forward( + *text_input_ids_list, *uncond_input_ids_list + ) + rtol = 4e-2 + atol = 4e-2 + np.testing.assert_allclose(torch_output1, turbine_output1, rtol, atol) + np.testing.assert_allclose(torch_output2, turbine_output2, rtol, atol) + +# def test02_ExportUnetModel(self): +# if arguments["device"] in ["vulkan", "cuda"]: +# self.skipTest("Unknown error on vulkan; To be tested on cuda.") +# unet.export_unet_model( +# unet_model=self.unet_model, +# # This is a public model, so no auth required +# hf_model_name=arguments["hf_model_name"], +# batch_size=arguments["batch_size"], +# height=arguments["height"], +# width=arguments["width"], +# precision=arguments["precision"], +# max_length=arguments["max_length"], +# hf_auth_token=None, +# compile_to="vmfb", +# external_weights=arguments["external_weights"], +# external_weight_path=self.safe_model_name +# + "_" +# + arguments["precision"] +# + "_unet." +# + arguments["external_weights"], +# device=arguments["device"], +# target_triple=arguments["iree_target_triple"], +# ireec_flags=arguments["ireec_flags"], +# decomp_attn=arguments["decomp_attn"], +# attn_spec=arguments["attn_spec"], +# ) +# arguments["external_weight_path"] = ( +# self.safe_model_name +# + "_" +# + arguments["precision"] +# + "_unet." 
+# + arguments["external_weights"] +# ) +# arguments["vmfb_path"] = ( +# self.safe_model_name +# + "_" +# + str(arguments["max_length"]) +# + "_" +# + str(arguments["height"]) +# + "x" +# + str(arguments["width"]) +# + "_" +# + arguments["precision"] +# + "_unet_" +# + arguments["device"] +# + ".vmfb" +# ) +# dtype = torch.float16 if arguments["precision"] == "fp16" else torch.float32 +# sample = torch.rand( +# ( +# arguments["batch_size"], +# arguments["in_channels"], +# arguments["height"] // 8, +# arguments["width"] // 8, +# ), +# dtype=dtype, +# ) +# timestep = torch.zeros(1, dtype=torch.int64) +# prompt_embeds = torch.rand( +# (2 * arguments["batch_size"], arguments["max_length"], 2048), +# dtype=dtype, +# ) +# text_embeds = torch.rand(2 * arguments["batch_size"], 1280, dtype=dtype) +# time_ids = torch.zeros(2 * arguments["batch_size"], 6, dtype=dtype) +# guidance_scale = torch.Tensor([arguments["guidance_scale"]]).to(dtype) +# +# turbine = unet_runner.run_unet( +# arguments["rt_device"], +# sample, +# timestep, +# prompt_embeds, +# text_embeds, +# time_ids, +# guidance_scale, +# arguments["vmfb_path"], +# arguments["hf_model_name"], +# arguments["hf_auth_token"], +# arguments["external_weight_path"], +# ) +# torch_output = unet_runner.run_torch_unet( +# arguments["hf_model_name"], +# arguments["hf_auth_token"], +# sample.float(), +# timestep, +# prompt_embeds.float(), +# text_embeds.float(), +# time_ids.float(), +# guidance_scale.float(), +# precision=arguments["precision"], +# ) +# if arguments["benchmark"] or arguments["tracy_profile"]: +# run_benchmark( +# "unet", +# arguments["vmfb_path"], +# arguments["external_weight_path"], +# arguments["rt_device"], +# max_length=arguments["max_length"], +# height=arguments["height"], +# width=arguments["width"], +# batch_size=arguments["batch_size"], +# in_channels=arguments["in_channels"], +# precision=arguments["precision"], +# tracy_profile=arguments["tracy_profile"], +# ) +# rtol = 4e-2 +# atol = 4e-1 +# +# np.testing.assert_allclose(torch_output, turbine, rtol, atol) +# +# def test03_ExportVaeModelDecode(self): +# if arguments["device"] in ["vulkan", "cuda"]: +# self.skipTest("Compilation error on vulkan; To be tested on cuda.") +# vae.export_vae_model( +# vae_model=self.vae_model, +# # This is a public model, so no auth required +# hf_model_name=arguments["hf_model_name"], +# batch_size=arguments["batch_size"], +# height=arguments["height"], +# width=arguments["width"], +# precision=arguments["precision"], +# compile_to="vmfb", +# external_weights=arguments["external_weights"], +# external_weight_path=self.safe_model_name +# + "_" +# + arguments["precision"] +# + "_vae_decode." +# + arguments["external_weights"], +# device=arguments["device"], +# target_triple=arguments["iree_target_triple"], +# ireec_flags=arguments["ireec_flags"], +# variant="decode", +# decomp_attn=arguments["decomp_attn"], +# attn_spec=arguments["attn_spec"], +# exit_on_vmfb=True, +# ) +# arguments["external_weight_path"] = ( +# self.safe_model_name +# + "_" +# + arguments["precision"] +# + "_vae_decode." 
+# + arguments["external_weights"] +# ) +# arguments["vmfb_path"] = ( +# self.safe_model_name +# + "_" +# + str(arguments["height"]) +# + "x" +# + str(arguments["width"]) +# + "_" +# + arguments["precision"] +# + "_vae_decode_" +# + arguments["device"] +# + ".vmfb" +# ) +# example_input = torch.ones( +# arguments["batch_size"], +# 4, +# arguments["height"] // 8, +# arguments["width"] // 8, +# dtype=torch.float32, +# ) +# example_input_torch = example_input +# if arguments["precision"] == "fp16": +# example_input = example_input.half() +# turbine = vae_runner.run_vae( +# arguments["rt_device"], +# example_input, +# arguments["vmfb_path"], +# arguments["hf_model_name"], +# arguments["external_weight_path"], +# ) +# torch_output = vae_runner.run_torch_vae( +# arguments["hf_model_name"], +# ( +# "madebyollin/sdxl-vae-fp16-fix" +# if arguments["precision"] == "fp16" +# else "" +# ), +# "decode", +# example_input_torch, +# ) +# if arguments["benchmark"] or arguments["tracy_profile"]: +# run_benchmark( +# "vae_decode", +# arguments["vmfb_path"], +# arguments["external_weight_path"], +# arguments["rt_device"], +# height=arguments["height"], +# width=arguments["width"], +# precision=arguments["precision"], +# tracy_profile=arguments["tracy_profile"], +# ) +# rtol = 4e-2 +# atol = 4e-1 +# +# np.testing.assert_allclose(torch_output, turbine, rtol, atol) +# +# def test04_ExportVaeModelEncode(self): +# if arguments["device"] in ["cpu", "vulkan", "cuda", "rocm"]: +# self.skipTest( +# "Compilation error on cpu, vulkan and rocm; To be tested on cuda." +# ) +# vae.export_vae_model( +# vae_model=self.vae_model, +# # This is a public model, so no auth required +# hf_model_name=arguments["hf_model_name"], +# batch_size=arguments["batch_size"], +# height=arguments["height"], +# width=arguments["width"], +# precision=arguments["precision"], +# compile_to="vmfb", +# external_weights=arguments["external_weights"], +# external_weight_path=self.safe_model_name +# + "_" +# + arguments["precision"] +# + "_vae_encode." +# + arguments["external_weights"], +# device=arguments["device"], +# target_triple=arguments["iree_target_triple"], +# ireec_flags=arguments["ireec_flags"], +# variant="encode", +# decomp_attn=arguments["decomp_attn"], +# exit_on_vmfb=True, +# ) +# arguments["external_weight_path"] = ( +# self.safe_model_name +# + "_" +# + arguments["precision"] +# + "_vae_encode." 
+# + arguments["external_weights"] +# ) +# arguments["vmfb_path"] = ( +# self.safe_model_name +# + "_" +# + str(arguments["height"]) +# + "x" +# + str(arguments["width"]) +# + "_" +# + arguments["precision"] +# + "_vae_encode_" +# + arguments["device"] +# + ".vmfb" +# ) +# example_input = torch.ones( +# arguments["batch_size"], +# 3, +# arguments["height"], +# arguments["width"], +# dtype=torch.float32, +# ) +# example_input_torch = example_input +# if arguments["precision"] == "fp16": +# example_input = example_input.half() +# turbine = vae_runner.run_vae( +# arguments["rt_device"], +# example_input, +# arguments["vmfb_path"], +# arguments["hf_model_name"], +# arguments["external_weight_path"], +# ) +# torch_output = vae_runner.run_torch_vae( +# arguments["hf_model_name"], +# ( +# "madebyollin/sdxl-vae-fp16-fix" +# if arguments["precision"] == "fp16" +# else "" +# ), +# "encode", +# example_input_torch, +# ) +# if arguments["benchmark"] or arguments["tracy_profile"]: +# run_benchmark( +# "vae_encode", +# arguments["vmfb_path"], +# arguments["external_weight_path"], +# arguments["rt_device"], +# height=arguments["height"], +# width=arguments["width"], +# precision=arguments["precision"], +# tracy_profile=arguments["tracy_profile"], +# ) +# rtol = 4e-2 +# atol = 4e-2 +# np.testing.assert_allclose(torch_output, turbine, rtol, atol) +# +# def test05_t2i_generate_images(self): +# if arguments["device"] in ["vulkan", "cuda", "rocm"]: +# self.skipTest( +# "Have issues with submodels on vulkan, cuda; ROCM hangs on mi250 despite submodels working." +# ) +# mlirs = { +# "vae_decode": None, +# "prompt_encoder": None, +# "scheduled_unet": None, +# "pipeline": None, +# "full_pipeline": None, +# } +# vmfbs = { +# "vae_decode": None, +# "prompt_encoder": None, +# "scheduled_unet": None, +# "pipeline": None, +# "full_pipeline": None, +# } +# weights = { +# "vae_decode": None, +# "prompt_encoder": None, +# "scheduled_unet": None, +# "pipeline": None, +# "full_pipeline": None, +# } +# +# if not arguments["pipeline_dir"]: +# pipe_id_list = [ +# "sdxl_1_0", +# str(arguments["height"]), +# str(arguments["width"]), +# str(arguments["max_length"]), +# arguments["precision"], +# arguments["device"], +# ] +# arguments["pipeline_dir"] = os.path.join( +# ".", +# "_".join(pipe_id_list), +# ) +# ireec_flags = { +# "unet": arguments["ireec_flags"], +# "vae": arguments["ireec_flags"], +# "clip": arguments["ireec_flags"], +# "pipeline": arguments["ireec_flags"], +# } +# user_mlir_list = [] +# for submodel_id, mlir_path in zip(mlirs.keys(), user_mlir_list): +# if submodel_id in mlir_path: +# mlirs[submodel_id] = mlir_path +# external_weights_dir = arguments["pipeline_dir"] +# sdxl_pipe = sdxl_compiled_pipeline.SharkSDXLPipeline( +# arguments["hf_model_name"], +# arguments["scheduler_id"], +# arguments["height"], +# arguments["width"], +# arguments["precision"], +# arguments["max_length"], +# arguments["batch_size"], +# arguments["num_inference_steps"], +# arguments["device"], +# arguments["iree_target_triple"], +# ireec_flags, +# arguments["attn_spec"], +# arguments["decomp_attn"], +# arguments["pipeline_dir"], +# external_weights_dir, +# arguments["external_weights"], +# ) +# vmfbs, weights = sdxl_pipe.check_prepared( +# mlirs, vmfbs, weights, interactive=False +# ) +# sdxl_pipe.load_pipeline( +# vmfbs, weights, arguments["rt_device"], arguments["compiled_pipeline"] +# ) +# sdxl_pipe.generate_images( +# arguments["prompt"], +# arguments["negative_prompt"], +# 1, +# arguments["guidance_scale"], +# arguments["seed"], +# ) 
+# print("Image generation complete.") +# os.remove(os.path.join(arguments["pipeline_dir"], "prompt_encoder.vmfb")) +# os.remove( +# os.path.join( +# arguments["pipeline_dir"], +# arguments["scheduler_id"] +# + "_unet_" +# + str(arguments["num_inference_steps"]) +# + ".vmfb", +# ) +# ) +# os.remove(os.path.join(arguments["pipeline_dir"], "vae_decode.vmfb")) +# os.remove(os.path.join(arguments["pipeline_dir"], "full_pipeline.vmfb")) +# + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + unittest.main() From fc6833d2d64844b913f8acebb70815d2ba46c900 Mon Sep 17 00:00:00 2001 From: dan Date: Wed, 19 Jun 2024 14:38:12 -0500 Subject: [PATCH 137/174] add it to workflow to test --- .github/workflows/test_models.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 184b1458f..03872dea3 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -73,3 +73,4 @@ jobs: pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default --batch_size 2 + pytest -v models/turbine_models/tests/sd3_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 5 \ No newline at end of file From 4e2c3cc39a8421a553c1e362680a7b84877bca3b Mon Sep 17 00:00:00 2001 From: dan Date: Wed, 19 Jun 2024 16:55:37 -0500 Subject: [PATCH 138/174] add tests for mmdit and vae --- models/turbine_models/tests/sd3_test.py | 345 +++++++++++------------- 1 file changed, 161 insertions(+), 184 deletions(-) diff --git a/models/turbine_models/tests/sd3_test.py b/models/turbine_models/tests/sd3_test.py index a495ebf5b..681cef403 100644 --- a/models/turbine_models/tests/sd3_test.py +++ b/models/turbine_models/tests/sd3_test.py @@ -170,96 +170,73 @@ def test01_ExportPromptEncoder(self): np.testing.assert_allclose(torch_output1, turbine_output1, rtol, atol) np.testing.assert_allclose(torch_output2, turbine_output2, rtol, atol) -# def test02_ExportUnetModel(self): -# if arguments["device"] in ["vulkan", "cuda"]: -# self.skipTest("Unknown error on vulkan; To be tested on cuda.") -# unet.export_unet_model( -# unet_model=self.unet_model, -# # This is a public model, so no auth required -# hf_model_name=arguments["hf_model_name"], -# batch_size=arguments["batch_size"], -# height=arguments["height"], -# width=arguments["width"], -# precision=arguments["precision"], -# max_length=arguments["max_length"], -# hf_auth_token=None, -# compile_to="vmfb", -# external_weights=arguments["external_weights"], -# external_weight_path=self.safe_model_name -# + "_" -# + arguments["precision"] -# + "_unet." -# + arguments["external_weights"], -# device=arguments["device"], -# target_triple=arguments["iree_target_triple"], -# ireec_flags=arguments["ireec_flags"], -# decomp_attn=arguments["decomp_attn"], -# attn_spec=arguments["attn_spec"], -# ) -# arguments["external_weight_path"] = ( -# self.safe_model_name -# + "_" -# + arguments["precision"] -# + "_unet." 
-# + arguments["external_weights"] -# ) -# arguments["vmfb_path"] = ( -# self.safe_model_name -# + "_" -# + str(arguments["max_length"]) -# + "_" -# + str(arguments["height"]) -# + "x" -# + str(arguments["width"]) -# + "_" -# + arguments["precision"] -# + "_unet_" -# + arguments["device"] -# + ".vmfb" -# ) -# dtype = torch.float16 if arguments["precision"] == "fp16" else torch.float32 -# sample = torch.rand( -# ( -# arguments["batch_size"], -# arguments["in_channels"], -# arguments["height"] // 8, -# arguments["width"] // 8, -# ), -# dtype=dtype, -# ) -# timestep = torch.zeros(1, dtype=torch.int64) -# prompt_embeds = torch.rand( -# (2 * arguments["batch_size"], arguments["max_length"], 2048), -# dtype=dtype, -# ) -# text_embeds = torch.rand(2 * arguments["batch_size"], 1280, dtype=dtype) -# time_ids = torch.zeros(2 * arguments["batch_size"], 6, dtype=dtype) -# guidance_scale = torch.Tensor([arguments["guidance_scale"]]).to(dtype) -# -# turbine = unet_runner.run_unet( -# arguments["rt_device"], -# sample, -# timestep, -# prompt_embeds, -# text_embeds, -# time_ids, -# guidance_scale, -# arguments["vmfb_path"], -# arguments["hf_model_name"], -# arguments["hf_auth_token"], -# arguments["external_weight_path"], -# ) -# torch_output = unet_runner.run_torch_unet( -# arguments["hf_model_name"], -# arguments["hf_auth_token"], -# sample.float(), -# timestep, -# prompt_embeds.float(), -# text_embeds.float(), -# time_ids.float(), -# guidance_scale.float(), -# precision=arguments["precision"], -# ) + def test02_ExportMMDITModel(self): + if arguments["device"] in ["vulkan", "cuda"]: + self.skipTest("Not testing on vulkan or cuda") + arguments["external_weight_path"] = ( + self.safe_model_name + + "_" + + arguments["precision"] + + "_mmdit." + + arguments["external_weights"] + ) + sd3_mmdit.export_mmdit_model( + mmdit_model=self.mmdit_model, + # This is a public model, so no auth required + hf_model_name=arguments["hf_model_name"], + batch_size=arguments["batch_size"], + height=arguments["height"], + width=arguments["width"], + precision=arguments["precision"], + max_length=arguments["max_length"], + hf_auth_token=None, + compile_to="vmfb", + external_weights=arguments["external_weights"], + external_weight_path=arguments["external_weight_path"], + device=arguments["mmdit_device"], + target_triple=arguments["iree_target_triple"], + ireec_flags=arguments["ireec_flags"], + decomp_attn=arguments["decomp_attn"], + attn_spec=arguments["attn_spec"], + ) + arguments["vmfb_path"] = ( + self.safe_model_name + + "_" + + str(arguments["max_length"]) + + "_" + + str(arguments["height"]) + + "x" + + str(arguments["width"]) + + "_" + + arguments["precision"] + + "_unet_" + + arguments["device"] + + ".vmfb" + ) + dtype = torch.float16 if arguments["precision"] == "fp16" else torch.float32 + + hidden_states = torch.randn( + (arguments["batch_size"], 16, arguments["height"] // 8, arguments["width"] // 8), dtype=dtype + ) + encoder_hidden_states = torch.randn( + (arguments["batch_size"], arguments["max_length"] * 2, 4096), dtype=dtype + ) + pooled_projections = torch.randn((arguments["batch_size"], 2048), dtype=dtype) + timestep = torch.tensor([0, 0], dtype=dtype) + turbine = sd3_mmdit_runner.run_mmdit_turbine( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + arguments, + ) + torch_output = sd3_mmdit_runner.run_diffusers_mmdit( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + arguments, + ) # if arguments["benchmark"] or arguments["tracy_profile"]: # run_benchmark( 
# "unet", @@ -274,99 +251,99 @@ def test01_ExportPromptEncoder(self): # precision=arguments["precision"], # tracy_profile=arguments["tracy_profile"], # ) -# rtol = 4e-2 -# atol = 4e-1 -# -# np.testing.assert_allclose(torch_output, turbine, rtol, atol) -# -# def test03_ExportVaeModelDecode(self): -# if arguments["device"] in ["vulkan", "cuda"]: -# self.skipTest("Compilation error on vulkan; To be tested on cuda.") -# vae.export_vae_model( -# vae_model=self.vae_model, -# # This is a public model, so no auth required -# hf_model_name=arguments["hf_model_name"], -# batch_size=arguments["batch_size"], -# height=arguments["height"], -# width=arguments["width"], -# precision=arguments["precision"], -# compile_to="vmfb", -# external_weights=arguments["external_weights"], -# external_weight_path=self.safe_model_name -# + "_" -# + arguments["precision"] -# + "_vae_decode." -# + arguments["external_weights"], -# device=arguments["device"], -# target_triple=arguments["iree_target_triple"], -# ireec_flags=arguments["ireec_flags"], -# variant="decode", -# decomp_attn=arguments["decomp_attn"], -# attn_spec=arguments["attn_spec"], -# exit_on_vmfb=True, -# ) -# arguments["external_weight_path"] = ( -# self.safe_model_name -# + "_" -# + arguments["precision"] -# + "_vae_decode." -# + arguments["external_weights"] -# ) -# arguments["vmfb_path"] = ( -# self.safe_model_name -# + "_" -# + str(arguments["height"]) -# + "x" -# + str(arguments["width"]) -# + "_" -# + arguments["precision"] -# + "_vae_decode_" -# + arguments["device"] -# + ".vmfb" -# ) -# example_input = torch.ones( -# arguments["batch_size"], -# 4, -# arguments["height"] // 8, -# arguments["width"] // 8, -# dtype=torch.float32, -# ) -# example_input_torch = example_input -# if arguments["precision"] == "fp16": -# example_input = example_input.half() -# turbine = vae_runner.run_vae( -# arguments["rt_device"], -# example_input, -# arguments["vmfb_path"], -# arguments["hf_model_name"], -# arguments["external_weight_path"], -# ) -# torch_output = vae_runner.run_torch_vae( -# arguments["hf_model_name"], -# ( -# "madebyollin/sdxl-vae-fp16-fix" -# if arguments["precision"] == "fp16" -# else "" -# ), -# "decode", -# example_input_torch, -# ) -# if arguments["benchmark"] or arguments["tracy_profile"]: -# run_benchmark( -# "vae_decode", -# arguments["vmfb_path"], -# arguments["external_weight_path"], -# arguments["rt_device"], -# height=arguments["height"], -# width=arguments["width"], -# precision=arguments["precision"], -# tracy_profile=arguments["tracy_profile"], -# ) -# rtol = 4e-2 -# atol = 4e-1 -# -# np.testing.assert_allclose(torch_output, turbine, rtol, atol) -# + rtol = 4e-2 + atol = 4e-1 + + np.testing.assert_allclose(torch_output, turbine, rtol, atol) + + def test03_ExportVaeModelDecode(self): + if arguments["device"] in ["vulkan", "cuda"]: + self.skipTest("not testing vulkan or cuda") + sd3_vae.export_vae_model( + vae_model=self.vae_model, + # This is a public model, so no auth required + exit_on_vmfb=True, + ) + + arguments["external_weight_path"] = ( + self.safe_model_name + + "_" + + arguments["precision"] + + "_vae_decode." 
+ + arguments["external_weights"] + ) + sd3_vae.export_vae_model( + self.vae_model, + hf_model_name=arguments["hf_model_name"], + batch_size=arguments["batch_size"], + height=arguments["height"], + width=arguments["width"], + precision=arguments["precision"], + compile_to="vmfb", + external_weights=arguments["external_weights"], + external_weight_path=arguments["external_weight_path"], + device=arguments["device"], + target_triple=arguments["iree_target_triple"], + ireec_flags=arguments["ireec_flags"], + variant="decode", + decomp_attn=arguments["decomp_attn"], + attn_spec=arguments["attn_spec"], + ) + arguments["vmfb_path"] = ( + self.safe_model_name + + "_" + + str(arguments["height"]) + + "x" + + str(arguments["width"]) + + "_" + + arguments["precision"] + + "_vae_decode_" + + arguments["device"] + + ".vmfb" + ) + example_input = torch.ones( + arguments["batch_size"], + 16, + arguments["height"] // 8, + arguments["width"] // 8, + dtype=torch.float32, + ) + example_input_torch = example_input + if arguments["precision"] == "fp16": + example_input = example_input.half() + turbine = sd3_vae_runner.run_vae( + arguments["rt_device"], + example_input, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["external_weight_path"], + ) + torch_output = sd3_vae_runner.run_torch_vae( + arguments["hf_model_name"], + ( + "madebyollin/sdxl-vae-fp16-fix" + if arguments["precision"] == "fp16" + else "" + ), + "decode", + example_input_torch, + ) + #if arguments["benchmark"] or arguments["tracy_profile"]: + # run_benchmark( + # "vae_decode", + # arguments["vmfb_path"], + # arguments["external_weight_path"], + # arguments["rt_device"], + # height=arguments["height"], + # width=arguments["width"], + # precision=arguments["precision"], + # tracy_profile=arguments["tracy_profile"], + # ) + rtol = 4e-2 + atol = 4e-1 + + np.testing.assert_allclose(torch_output, turbine, rtol, atol) + # def test04_ExportVaeModelEncode(self): # if arguments["device"] in ["cpu", "vulkan", "cuda", "rocm"]: # self.skipTest( @@ -454,7 +431,7 @@ def test01_ExportPromptEncoder(self): # rtol = 4e-2 # atol = 4e-2 # np.testing.assert_allclose(torch_output, turbine, rtol, atol) -# + # def test05_t2i_generate_images(self): # if arguments["device"] in ["vulkan", "cuda", "rocm"]: # self.skipTest( From 3c83a21144078bcd77bec2637fb2953f163ad628 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 20 Jun 2024 01:02:14 -0500 Subject: [PATCH 139/174] Allow schedulers to load/reload at generate_images --- .../sd3_inference/sd3_pipeline.py | 9 ++--- .../sd3_inference/sd3_schedulers.py | 2 +- .../custom_models/sd_inference/schedulers.py | 10 +++--- .../sdxl_inference/sdxl_compiled_pipeline.py | 36 ++++++++++++++----- 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 15b53a96c..99d14055f 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -176,23 +176,24 @@ def check_prepared( def is_prepared(self, vmfbs, weights): missing = [] - height = self.height - width = self.width + dims = f"{str(self.width)}x{str(self.height)}" for key in vmfbs: if key == "scheduler": continue elif key == "vae": - keywords = ["vae", self.vae_precision, height, width] + keywords = ["vae", self.vae_precision, dims] device_key = "vae" elif key == "clip": keywords = ["text_encoders", self.precision, 
self.max_length] device_key = "clip" else: - keywords = [key, self.precision, self.max_length, height, width] + keywords = [key, self.precision, self.max_length, dims] device_key = key avail_files = os.listdir(self.pipeline_dir) keywords.append("vmfb") + keywords.append(utils.create_safe_name(self.hf_model_name, "")) keywords.append(self.devices[device_key]["target"]) + print(keywords) for filename in avail_files: if all(str(x) in filename for x in keywords): vmfbs[key] = os.path.join(self.pipeline_dir, filename) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 7c6acbd03..26dbfb8f3 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -208,7 +208,7 @@ def export_scheduler_model( scheduler_module = FlowSchedulingModel(hf_model_name, num_inference_steps, dtype) vmfb_names = [ "EulerFlowScheduler", - f"bs{args.batch_size}_{args.height}x{args.width}", + f"bs{batch_size}_{height}x{width}", precision, str(num_inference_steps), target_triple, diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 2b258a950..d1e4b0028 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -136,15 +136,17 @@ def __init__( self.dest = dest_device self.dtype = latents_dtype self.batch_size = batch_size - self.module.set_timesteps(num_inference_steps) - self.timesteps = self.module.timesteps + self.timesteps = None self.torch_dtype = ( torch.float32 if latents_dtype == "float32" else torch.float16 ) - def initialize(self, sample): + def initialize(self, sample, num_inference_steps): if isinstance(sample, ireert.DeviceArray): sample = torch.tensor(sample.to_host(), dtype=torch.float32) + + self.module.set_timesteps(num_inference_steps) + self.timesteps = self.module.timesteps height = sample.shape[2] * 8 width = sample.shape[3] * 8 original_size = (height, width) @@ -157,7 +159,7 @@ def initialize(self, sample): add_time_ids = add_time_ids.repeat(self.batch_size, 1).type( self.torch_dtype ) - step_indexes = torch.tensor(len(self.module.timesteps)) + step_indexes = torch.tensor(len(self.timesteps)) timesteps = self.timesteps sample = sample * self.module.init_noise_sigma add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 61679c604..2bbff8a33 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -118,8 +118,10 @@ def __init__( self.vae_dtype = "float32" if vae_precision == "fp32" else "float16" self.custom_vae = custom_vae self.cpu_scheduling = cpu_scheduling + self.compiled_pipeline = False # TODO: set this based on user-inputted guidance scale and negative prompt. 
self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + self._interrupt = False # FILE MANAGEMENT AND PIPELINE SETUP @@ -145,6 +147,9 @@ def check_prepared( if vmfbs[submodel] == None: print("Fetching: ", submodel) vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) + if self._interrupt: + self._interrupt = False + return None, None vmfbs[submodel] = vmfb if weights[submodel] is None: weights[submodel] = weight @@ -166,25 +171,25 @@ def check_prepared( def is_prepared(self, vmfbs, weights): missing = [] - height = self.height - width = self.width + dims = f"{str(self.width)}x{str(self.height)}" for key in vmfbs: if key == "scheduled_unet": - keywords = ["unet", self.scheduler_id, self.num_inference_steps, self.precision, height, width] + keywords = ["unet", self.scheduler_id, self.num_inference_steps, self.precision, dims] device_key = "unet" elif key == "scheduler": continue elif key == "vae_decode": - keywords = ["vae", self.vae_precision, height, width] + keywords = ["vae", self.vae_precision, dims] device_key = "vae" elif key == "prompt_encoder": keywords = ["prompt_encoder", self.precision, self.max_length] device_key = "clip" else: - keywords = [key, self.precision, self.max_length, height, width] + keywords = [key, self.precision, self.max_length, dims] device_key = key avail_files = os.listdir(self.pipeline_dir) keywords.append("vmfb") + keywords.append(utils.create_safe_name(self.hf_model_name.split("/")[-1], "")) keywords.append(self.devices[device_key]["target"]) for filename in avail_files: if all(str(x) in filename for x in keywords): @@ -272,6 +277,8 @@ def export_submodel( input_mlir: str = None, weights_only: bool = False, ): + if self._interrupt: + return None, None if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir) if self.external_weights and self.external_weights_dir: @@ -760,11 +767,19 @@ def generate_images( encode_prompts_end = time.time() for i in range(batch_count): + if self._interrupt: + self._interrupt = False + return None unet_start = time.time() if self.split_scheduler: - sample, time_ids, steps, timesteps = self.runners[ - "scheduler" - ].initialize(samples[i]) + if self.cpu_scheduling: + sample, time_ids, steps, timesteps = self.runners[ + "scheduler" + ].initialize(samples[i], self.num_inference_steps) + else: + sample, time_ids, steps, timesteps = self.runners[ + "scheduler" + ].initialize(samples[i]) iree_inputs = [ sample, ireert.asdevicearray( @@ -777,6 +792,9 @@ def generate_images( None, ] for s in range(steps): + if self._interrupt: + self._interrupt = False + return None # print(f"step {s}") if self.cpu_scheduling: step_index = s @@ -829,9 +847,11 @@ def generate_images( dtype=self.vae_dtype, ) vae_start = time.time() + #print(latents.to_host()[0,0,:]) vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( latents ) + #print(vae_out.to_host()[0,0,:]) pipe_end = time.time() From 9bbbafcee6020fce1fa430da97f7fd930d3e36fc Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 20 Jun 2024 12:11:17 -0500 Subject: [PATCH 140/174] Update linter and reformat. 
--- .github/workflows/lint.yml | 2 +- .../sd3_inference/sd3_pipeline.py | 44 +++++++++----- .../sd3_inference/sd3_schedulers.py | 14 ++++- .../sd3_inference/sd3_text_encoders.py | 3 +- .../sd3_inference/sd3_vae_runner.py | 8 +-- .../custom_models/sd_inference/schedulers.py | 3 +- .../custom_models/sd_inference/utils.py | 10 ++-- .../sdxl_inference/sdxl_compiled_pipeline.py | 48 +++++++++++----- .../sdxl_inference/sdxl_prompt_encoder.py | 3 +- .../sdxl_inference/sdxl_scheduled_unet.py | 4 +- models/turbine_models/tests/sd3_test.py | 57 +++++++++++-------- 11 files changed, 123 insertions(+), 73 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b718d0832..6f2e388d1 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -21,7 +21,7 @@ jobs: run: git fetch --no-tags --prune --depth=1 origin "${GITHUB_BASE_REF?}:${GITHUB_BASE_REF?}" - name: Install black run: | - python3 -m pip install black==23.3 + python3 -m pip install black - name: Check if modified files are formatted run: | # The filter lowercase `d` means to exclude deleted files. diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 99d14055f..ce57dbe15 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -63,9 +63,8 @@ def __init__( vae_decomp_attn: bool = False, cpu_scheduling: bool = False, vae_precision: str = "fp32", - scheduler_id: str = None, #compatibility only, always uses EulerFlowScheduler + scheduler_id: str = None, # compatibility only, always uses EulerFlowScheduler shift: float = 1.0, - ): self.hf_model_name = hf_model_name # self.scheduler_id = scheduler_id @@ -131,6 +130,7 @@ def __init__( # TODO: set this based on user-inputted guidance scale and negative prompt. self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True self._interrupt = False + # FILE MANAGEMENT AND PIPELINE SETUP def check_prepared( @@ -211,7 +211,8 @@ def is_prepared(self, vmfbs, weights): ) if w_key == "clip": default_name = os.path.join( - self.external_weights_dir, f"sd3_text_encoders_{self.precision}.irpa" + self.external_weights_dir, + f"sd3_text_encoders_{self.precision}.irpa", ) if w_key == "mmdit": default_name = os.path.join( @@ -269,7 +270,8 @@ def export_submodel( if not os.path.exists(self.external_weights_dir): os.makedirs(self.external_weights_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.external_weights_dir, f"sd3_vae_{self.vae_precision}." + self.external_weights + self.external_weights_dir, + f"sd3_vae_{self.vae_precision}." + self.external_weights, ) mmdit_external_weight_path = os.path.join( self.external_weights_dir, @@ -292,7 +294,8 @@ def export_submodel( if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.pipeline_dir, f"sd3_vae_{self.vae_precision}." + self.external_weights + self.pipeline_dir, + f"sd3_vae_{self.vae_precision}." 
+ self.external_weights, ) mmdit_external_weight_path = os.path.join( self.pipeline_dir, @@ -481,7 +484,9 @@ def generate_images( scheduler_id: str = None, progress=None, ): - needs_new_scheduler = (steps and steps != self.num_inference_steps) or cpu_scheduling != self.cpu_scheduling + needs_new_scheduler = ( + steps and steps != self.num_inference_steps + ) or cpu_scheduling != self.cpu_scheduling self.cpu_scheduling = cpu_scheduling if steps: self.num_inference_steps = steps @@ -490,7 +495,7 @@ def generate_images( self.num_inference_steps = steps scheduler_path = f"EulerFlowScheduler_{self.num_inference_steps}" if not os.path.exists(scheduler_path): - scheduler_path, _ = self.export_submodel("scheduler") + scheduler_path, _ = self.export_submodel("scheduler") try: self.runners["scheduler"] = sd3_schedulers.SharkSchedulerWrapper( self.devices["mmdit"]["driver"], @@ -580,19 +585,20 @@ def generate_images( if self.cpu_scheduling: timesteps, num_inference_steps = sd3_schedulers.retrieve_timesteps( self.runners["scheduler"], - num_inference_steps=steps, + num_inference_steps=steps, timesteps=None, ) steps = num_inference_steps - for i in range(batch_count): if self._interrupt: self._interrupt = False return unet_start = time.time() if not self.cpu_scheduling: - latents, steps, timesteps = self.runners["scheduler"].initialize(samples[i]) + latents, steps, timesteps = self.runners["scheduler"].initialize( + samples[i] + ) else: latents = torch.tensor(samples[i].to_host(), dtype=self.torch_dtype) iree_inputs = [ @@ -607,7 +613,9 @@ def generate_images( ), None, ] - for s in tqdm(iterable=range(steps), desc=f"Inference steps ({steps}), batch {i+1}"): + for s in tqdm( + iterable=range(steps), desc=f"Inference steps ({steps}), batch {i+1}" + ): if self._interrupt: self._interrupt = False return @@ -640,7 +648,7 @@ def generate_images( ) t = ireert.asdevicearray( self.runners["scheduler"].runner.config.device, - timestep.to_host()[0] + timestep.to_host()[0], ) noise_pred = self.runners["pipe"].ctx.modules.compiled_mmdit[ "run_forward" @@ -659,10 +667,14 @@ def generate_images( step_index, ) else: - noise_pred = torch.tensor(noise_pred.to_host(), dtype=self.torch_dtype) + noise_pred = torch.tensor( + noise_pred.to_host(), dtype=self.torch_dtype + ) if self.do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) latents = self.runners["scheduler"].step( noise_pred, t, @@ -676,7 +688,9 @@ def generate_images( latents, ) else: - vae_numpy_dtype = np.float32 if self.vae_precision == "fp32" else np.float16 + vae_numpy_dtype = ( + np.float32 if self.vae_precision == "fp32" else np.float16 + ) latents = latents.astype(vae_numpy_dtype) vae_start = time.time() diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index 26dbfb8f3..ea0213486 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -149,6 +149,7 @@ def step(self, noise_pred, t, latents, guidance_scale, i): return_dict=False, )[0] + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps # Only used for cpu scheduling. 
def retrieve_timesteps( @@ -160,9 +161,13 @@ def retrieve_timesteps( **kwargs, ): if timesteps is not None and sigmas is not None: - raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + raise ValueError( + "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values" + ) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = "timesteps" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) if not accepts_timesteps: raise ValueError( f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" @@ -172,7 +177,9 @@ def retrieve_timesteps( timesteps = scheduler.timesteps num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = "sigmas" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) if not accept_sigmas: raise ValueError( f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" @@ -186,6 +193,7 @@ def retrieve_timesteps( timesteps = scheduler.timesteps return timesteps, num_inference_steps + @torch.no_grad() def export_scheduler_model( hf_model_name: str, diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index bebbee499..2e0a69445 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -131,7 +131,8 @@ def export_text_encoders( ): safe_name = utils.create_safe_name( - hf_model_name, f"_bs{output_batchsize}_{str(max_length)}_{precision}_text_encoders-{device}" + hf_model_name, + f"_bs{output_batchsize}_{str(max_length)}_{precision}_text_encoders-{device}", ) if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py index 9cb435bde..1267bb862 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py @@ -32,9 +32,10 @@ def run_torch_vae(hf_model_name, variant, example_input): elif variant == "encode": results = vae_model.encode(example_input) np_torch_output = results.detach().cpu().numpy() - np_torch_output = imagearray_from_vae_out(np_torch_output) + np_torch_output = imagearray_from_vae_out(np_torch_output) return np_torch_output + def imagearray_from_vae_out(image): if image.ndim == 4: image = image[0] @@ -42,6 +43,7 @@ def imagearray_from_vae_out(image): image = (image * 255).round().astype("uint8") return image + if __name__ == "__main__": from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args import numpy as np @@ -89,6 +91,4 @@ def imagearray_from_vae_out(image): out_image_turbine.save("vae_test_output_turbine.png") # Allow a small amount of wiggle room for rounding errors (1) - np.testing.assert_allclose( - turbine_results, torch_output, rtol=1, atol=1 - ) + np.testing.assert_allclose(turbine_results, torch_output, rtol=1, atol=1) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 
d1e4b0028..7b2248152 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -224,13 +224,12 @@ def export_scheduler_model( precision, str(num_inference_steps), target_triple, - ] + ] vmfb_name = "_".join(vmfb_names) safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) - if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 0931a4028..84700185c 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -349,11 +349,11 @@ def get_schedulers(model_id): model_id, subfolder="scheduler", ) - schedulers[ - "EulerAncestralDiscrete" - ] = EulerAncestralDiscreteScheduler.from_pretrained( - model_id, - subfolder="scheduler", + schedulers["EulerAncestralDiscrete"] = ( + EulerAncestralDiscreteScheduler.from_pretrained( + model_id, + subfolder="scheduler", + ) ) # schedulers["DPMSolverSDE"] = DPMSolverSDEScheduler.from_pretrained( # model_id, diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 2bbff8a33..71e5730b4 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -174,7 +174,13 @@ def is_prepared(self, vmfbs, weights): dims = f"{str(self.width)}x{str(self.height)}" for key in vmfbs: if key == "scheduled_unet": - keywords = ["unet", self.scheduler_id, self.num_inference_steps, self.precision, dims] + keywords = [ + "unet", + self.scheduler_id, + self.num_inference_steps, + self.precision, + dims, + ] device_key = "unet" elif key == "scheduler": continue @@ -189,7 +195,9 @@ def is_prepared(self, vmfbs, weights): device_key = key avail_files = os.listdir(self.pipeline_dir) keywords.append("vmfb") - keywords.append(utils.create_safe_name(self.hf_model_name.split("/")[-1], "")) + keywords.append( + utils.create_safe_name(self.hf_model_name.split("/")[-1], "") + ) keywords.append(self.devices[device_key]["target"]) for filename in avail_files: if all(str(x) in filename for x in keywords): @@ -285,13 +293,16 @@ def export_submodel( if not os.path.exists(self.external_weights_dir): os.makedirs(self.external_weights_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.external_weights_dir, f"vae_decode_{self.vae_precision}." + self.external_weights + self.external_weights_dir, + f"vae_decode_{self.vae_precision}." + self.external_weights, ) unet_external_weight_path = os.path.join( - self.external_weights_dir, f"unet_{self.precision}." + self.external_weights + self.external_weights_dir, + f"unet_{self.precision}." + self.external_weights, ) prompt_encoder_external_weight_path = os.path.join( - self.external_weights_dir, f"prompt_encoder_{self.precision}." + self.external_weights + self.external_weights_dir, + f"prompt_encoder_{self.precision}." + self.external_weights, ) elif self.external_weights is None: print( @@ -307,13 +318,15 @@ def export_submodel( if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir, exist_ok=True) vae_external_weight_path = os.path.join( - self.pipeline_dir, f"vae_decode_{self.vae_precision}." 
+ self.external_weights + self.pipeline_dir, + f"vae_decode_{self.vae_precision}." + self.external_weights, ) unet_external_weight_path = os.path.join( self.pipeline_dir, f"unet_{self.precision}." + self.external_weights ) prompt_encoder_external_weight_path = os.path.join( - self.pipeline_dir, f"prompt_encoder_{self.precision}." + self.external_weights + self.pipeline_dir, + f"prompt_encoder_{self.precision}." + self.external_weights, ) if weights_only: input_mlir = { @@ -617,17 +630,24 @@ def generate_images( scheduler_id: str = "EulerDiscrete", progress=None, ): - needs_new_scheduler = (steps and steps != self.num_inference_steps) or cpu_scheduling != self.cpu_scheduling + needs_new_scheduler = ( + steps and steps != self.num_inference_steps + ) or cpu_scheduling != self.cpu_scheduling self.cpu_scheduling = cpu_scheduling if steps and not self.compiled_pipeline and needs_new_scheduler: self.num_inference_steps = steps - if steps and not self.cpu_scheduling and not self.compiled_pipeline and needs_new_scheduler: + if ( + steps + and not self.cpu_scheduling + and not self.compiled_pipeline + and needs_new_scheduler + ): self.runners["scheduler"] = None self.num_inference_steps = steps self.scheduler_id = scheduler_id scheduler_path = f"{scheduler_id}Scheduler_{self.num_inference_steps}" if not os.path.exists(scheduler_path): - scheduler_path, _ = self.export_submodel("scheduler") + scheduler_path, _ = self.export_submodel("scheduler") try: self.runners["scheduler"] = schedulers.SharkSchedulerWrapper( self.devices["unet"]["driver"], @@ -637,9 +657,7 @@ def generate_images( print("JIT export of scheduler failed. Loading CPU scheduler.") self.cpu_scheduling = True if self.cpu_scheduling and needs_new_scheduler: - scheduler = schedulers.get_scheduler( - self.hf_model_name, scheduler_id - ) + scheduler = schedulers.get_scheduler(self.hf_model_name, scheduler_id) self.runners["scheduler"] = schedulers.SharkSchedulerCPUWrapper( scheduler, self.batch_size, @@ -847,11 +865,11 @@ def generate_images( dtype=self.vae_dtype, ) vae_start = time.time() - #print(latents.to_host()[0,0,:]) + # print(latents.to_host()[0,0,:]) vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( latents ) - #print(vae_out.to_host()[0,0,:]) + # print(vae_out.to_host()[0,0,:]) pipe_end = time.time() diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index f4174ca2a..ecfc4baf6 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -168,7 +168,8 @@ def export_prompt_encoder( do_classifier_free_guidance = True safe_name = utils.create_safe_name( - hf_model_name, f"_bs{output_batchsize}_{str(max_length)}-{precision}-prompt-encoder-{device}" + hf_model_name, + f"_bs{output_batchsize}_{str(max_length)}-{precision}-prompt-encoder-{device}", ) if pipeline_dir not in [None, ""]: safe_name = os.path.join(pipeline_dir, safe_name) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index 6ec6d11a5..b8bffe768 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -177,9 +177,7 @@ def export_scheduled_unet_model( 
f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_scheduled_unet_{str(num_inference_steps)}", ) if pipeline_dir: - safe_name = os.path.join( - pipeline_dir, safe_name - ) + safe_name = os.path.join(pipeline_dir, safe_name) if input_mlir: vmfb_path = utils.compile_to_vmfb( diff --git a/models/turbine_models/tests/sd3_test.py b/models/turbine_models/tests/sd3_test.py index 681cef403..b1fc664ac 100644 --- a/models/turbine_models/tests/sd3_test.py +++ b/models/turbine_models/tests/sd3_test.py @@ -57,7 +57,9 @@ def command_line_args(request): "--external_weight_path" ) arguments["external_weight_dir"] = request.config.getoption("--external_weight_dir") - arguments["external_weight_file"] = request.config.getoption("--external_weight_file") + arguments["external_weight_file"] = request.config.getoption( + "--external_weight_file" + ) arguments["vmfb_path"] = request.config.getoption("--vmfb_path") arguments["pipeline_vmfb_path"] = request.config.getoption("--pipeline_vmfb_path") arguments["scheduler_vmfb_path"] = request.config.getoption("--scheduler_vmfb_path") @@ -96,6 +98,7 @@ def command_line_args(request): arguments["vae_flags"] = request.config.getoption("--vae_flags") arguments["mmdit_flags"] = request.config.getoption("--mmdit_flags") + @pytest.mark.usefixtures("command_line_args") class StableDiffusion3Test(unittest.TestCase): def setUp(self): @@ -116,11 +119,12 @@ def setUp(self): def test01_ExportPromptEncoder(self): if arguments["device"] in ["vulkan", "cuda"]: - self.skipTest( - "Not testing sd3 on vk or cuda" - ) + self.skipTest("Not testing sd3 on vk or cuda") arguments["external_weight_path"] = ( - arguments["external_weight_path"] + "/sd3_text_encoders_"+arguments["precision"]+ ".irpa" + arguments["external_weight_path"] + + "/sd3_text_encoders_" + + arguments["precision"] + + ".irpa" ) _, prompt_encoder_vmfb = sd3_text_encoders.export_text_encoders( arguments["hf_model_name"], @@ -216,8 +220,14 @@ def test02_ExportMMDITModel(self): dtype = torch.float16 if arguments["precision"] == "fp16" else torch.float32 hidden_states = torch.randn( - (arguments["batch_size"], 16, arguments["height"] // 8, arguments["width"] // 8), dtype=dtype - ) + ( + arguments["batch_size"], + 16, + arguments["height"] // 8, + arguments["width"] // 8, + ), + dtype=dtype, + ) encoder_hidden_states = torch.randn( (arguments["batch_size"], arguments["max_length"] * 2, 4096), dtype=dtype ) @@ -237,20 +247,20 @@ def test02_ExportMMDITModel(self): timestep, arguments, ) -# if arguments["benchmark"] or arguments["tracy_profile"]: -# run_benchmark( -# "unet", -# arguments["vmfb_path"], -# arguments["external_weight_path"], -# arguments["rt_device"], -# max_length=arguments["max_length"], -# height=arguments["height"], -# width=arguments["width"], -# batch_size=arguments["batch_size"], -# in_channels=arguments["in_channels"], -# precision=arguments["precision"], -# tracy_profile=arguments["tracy_profile"], -# ) + # if arguments["benchmark"] or arguments["tracy_profile"]: + # run_benchmark( + # "unet", + # arguments["vmfb_path"], + # arguments["external_weight_path"], + # arguments["rt_device"], + # max_length=arguments["max_length"], + # height=arguments["height"], + # width=arguments["width"], + # batch_size=arguments["batch_size"], + # in_channels=arguments["in_channels"], + # precision=arguments["precision"], + # tracy_profile=arguments["tracy_profile"], + # ) rtol = 4e-2 atol = 4e-1 @@ -264,7 +274,7 @@ def test03_ExportVaeModelDecode(self): # This is a public model, so no auth required 
exit_on_vmfb=True, ) - + arguments["external_weight_path"] = ( self.safe_model_name + "_" @@ -328,7 +338,7 @@ def test03_ExportVaeModelDecode(self): "decode", example_input_torch, ) - #if arguments["benchmark"] or arguments["tracy_profile"]: + # if arguments["benchmark"] or arguments["tracy_profile"]: # run_benchmark( # "vae_decode", # arguments["vmfb_path"], @@ -344,6 +354,7 @@ def test03_ExportVaeModelDecode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) + # def test04_ExportVaeModelEncode(self): # if arguments["device"] in ["cpu", "vulkan", "cuda", "rocm"]: # self.skipTest( From 7388e14fe838c81616ca45585adae46e3d02dd7a Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 20 Jun 2024 16:22:34 -0500 Subject: [PATCH 141/174] SDXL: fix scheduled unet modes --- .../custom_models/sd_inference/schedulers.py | 1 - .../custom_models/sd_inference/utils.py | 20 +++- .../sdxl_inference/sdxl_cmd_opts.py | 2 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 112 ++++++++++-------- .../sdxl_inference/sdxl_scheduled_unet.py | 12 +- 5 files changed, 91 insertions(+), 56 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 7b2248152..2c8d618c6 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -223,7 +223,6 @@ def export_scheduler_model( f"{height}x{width}", precision, str(num_inference_steps), - target_triple, ] vmfb_name = "_".join(vmfb_names) safe_name = utils.create_safe_name(hf_model_name, "_" + vmfb_name) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 84700185c..8822d0144 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -240,6 +240,16 @@ def compile_to_vmfb( flags.pop(idx) print("Compiling to", device, "with flags:", flags) + # Forces a standard for naming files: + # If safe_name has target triple in it, get rid of target triple in mlir name + # + if target_triple not in safe_name: + safe_vmfb_name = safe_name + "_" + target_triple + safe_mlir_name = safe_name + else: + safe_vmfb_name = safe_name + safe_mlir_name = "".join(safe_name.split(target_triple)) + if mlir_source == "file": flatbuffer_blob = ireec.compile_file( module_str, @@ -249,9 +259,9 @@ def compile_to_vmfb( ) elif mlir_source == "str": if save_mlir: - with open(f"{safe_name}.mlir", "w+") as f: + with open(f"{safe_mlir_name}.mlir", "w+") as f: f.write(module_str) - print("Saved to", safe_name + ".mlir") + print("Saved to", safe_mlir_name + ".mlir") flatbuffer_blob = ireec.compile_str( module_str, target_backends=[device], @@ -260,11 +270,11 @@ def compile_to_vmfb( ) else: raise ValueError("mlir_source must be either 'file' or 'str'") - with open(f"{safe_name}.vmfb", "wb+") as f: + with open(f"{safe_vmfb_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) - print("Saved to", safe_name + ".vmfb") + print(f"Saved to {safe_vmfb_name}.vmfb") if return_path == True: - return safe_name + ".vmfb" + return safe_vmfb_name + ".vmfb" def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py index 5d5bde32f..c1c21301b 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py +++ 
b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py @@ -125,7 +125,7 @@ def is_valid_file(arg): p.add_argument( "--split_scheduler", - default=True, + default=False, action="store_true", help="Use a decoupled unet and scheduler for better QOL.", ) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 71e5730b4..8d6ce4ed1 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -119,6 +119,7 @@ def __init__( self.custom_vae = custom_vae self.cpu_scheduling = cpu_scheduling self.compiled_pipeline = False + self.split_scheduler = False # TODO: set this based on user-inputted guidance scale and negative prompt. self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True self._interrupt = False @@ -166,7 +167,7 @@ def check_prepared( print("There was an error generating the necessary files.") exit() else: - print("All necessary files found. Loading pipeline.") + print("All necessary files found.") return vmfbs, weights def is_prepared(self, vmfbs, weights): @@ -175,10 +176,11 @@ def is_prepared(self, vmfbs, weights): for key in vmfbs: if key == "scheduled_unet": keywords = [ - "unet", + "DiffusionModule", self.scheduler_id, - self.num_inference_steps, + str(self.num_inference_steps), self.precision, + self.max_length, dims, ] device_key = "unet" @@ -192,38 +194,44 @@ def is_prepared(self, vmfbs, weights): device_key = "clip" else: keywords = [key, self.precision, self.max_length, dims] - device_key = key - avail_files = os.listdir(self.pipeline_dir) - keywords.append("vmfb") - keywords.append( - utils.create_safe_name(self.hf_model_name.split("/")[-1], "") + device_key = "unet" + keywords.extend( + [ + utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), + "vmfb", + "bs" + str(self.batch_size), + self.devices[device_key]["target"], + ] ) - keywords.append(self.devices[device_key]["target"]) + avail_files = os.listdir(self.pipeline_dir) for filename in avail_files: if all(str(x) in filename for x in keywords): vmfbs[key] = os.path.join(self.pipeline_dir, filename) if not vmfbs[key]: missing.append(key + " vmfb") + for w_key in weights: - if any(x in w_key for x in ["pipeline", "scheduler"]): - continue - if weights[w_key] is not None: - continue - if self.external_weights is None: - continue - default_name = os.path.join( - self.external_weights_dir, w_key + "." + self.external_weights - ) - if weights[w_key] is None and os.path.exists(default_name): - weights[w_key] = os.path.join(default_name) - elif w_key in ["scheduled_unet"] and os.path.exists( - os.path.join(self.external_weights_dir, "unet." + self.external_weights) + if any(x in w_key for x in ["pipeline", "scheduler"]) or ( + self.external_weights is None ): - weights[w_key] = os.path.join( - self.external_weights_dir, "unet." 
+ self.external_weights + continue + elif weights[w_key] is not None: + print("Weights already found for ", w_key, "at: ", weights[w_key]) + elif w_key == "vae_decode": + keywords = ["vae", self.vae_precision] + elif w_key in ["prompt_encoder", "clip"]: + keywords = ["prompt_encoder", self.precision] + elif w_key in ["scheduled_unet", "unet"]: + keywords = ["unet", self.precision] + avail_weights = os.listdir(self.external_weights_dir) + for filename in avail_weights: + if all(str(x) in filename for x in keywords): + weights[w_key] = os.path.join(self.external_weights_dir, filename) + if not weights[w_key]: + missing.append( + " ".join([keywords[0], keywords[1], self.external_weights]) ) - else: - missing.append(w_key + "." + self.external_weights) + if len(missing) > 0: print(f"Missing files: " + ", ".join(missing)) return False, vmfbs, weights @@ -476,12 +484,20 @@ def export_submodel( self.max_length, "unet_loop", ) + pipeline_keys = [ + utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), + "bs" + str(self.batch_size), + f"{str(self.width)}x{str(self.height)}", + self.precision, + str(self.max_length), + "pipeline", + ] pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, self.devices["unet"]["device"], self.devices["unet"]["target"], self.ireec_flags["pipeline"], - os.path.join(self.pipeline_dir, "pipeline"), + os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), return_path=True, mlir_source="str", ) @@ -495,12 +511,20 @@ def export_submodel( self.max_length, "tokens_to_image", ) + pipeline_keys = [ + utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), + "bs" + str(self.batch_size), + f"{str(self.width)}x{str(self.height)}", + self.precision, + str(self.max_length), + "full_pipeline", + ] pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, self.devices["unet"]["device"], self.devices["unet"]["target"], self.ireec_flags["pipeline"], - os.path.join(self.pipeline_dir, "full_pipeline"), + os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), return_path=True, mlir_source="str", ) @@ -631,9 +655,13 @@ def generate_images( progress=None, ): needs_new_scheduler = ( - steps and steps != self.num_inference_steps - ) or cpu_scheduling != self.cpu_scheduling + (steps and steps != self.num_inference_steps) + or (cpu_scheduling != self.cpu_scheduling) + and self.split_scheduler + ) + self.cpu_scheduling = cpu_scheduling + if steps and not self.compiled_pipeline and needs_new_scheduler: self.num_inference_steps = steps if ( @@ -953,13 +981,10 @@ def numpy_to_pil_image(images): map = empty_pipe_dict if args.split_scheduler: - map["scheduler"] = None map["unet"] = None map.pop("scheduled_unet") map.pop("pipeline") map.pop("full_pipeline") - if args.cpu_scheduling: - map.pop("scheduler") mlirs = copy.deepcopy(map) vmfbs = copy.deepcopy(map) weights = copy.deepcopy(map) @@ -1002,20 +1027,12 @@ def numpy_to_pil_image(images): "scheduler": args.ireec_flags, } if not args.pipeline_dir: - pipe_id_list = [ - args.hf_model_name.split("/")[-1], - str(args.height), - str(args.width), - str(args.max_length), - args.precision, - args.device, - ] - if args.decomp_attn: - pipe_id_list.append("decomp") args.pipeline_dir = os.path.join( ".", - "_".join(pipe_id_list), + utils.create_safe_name(args.hf_model_name, ""), ) + if not os.path.exists(args.pipeline_dir): + os.makedirs(args.pipeline_dir, exist_ok=True) if args.input_mlir: user_mlir_list = args.input_mlir.split(",") else: @@ -1027,14 +1044,15 @@ def numpy_to_pil_image(images): args.external_weights_dir = args.pipeline_dir 
sdxl_pipe = SharkSDXLPipeline( args.hf_model_name, - args.scheduler_id, args.height, args.width, args.precision, args.max_length, args.batch_size, + args.num_inference_steps, devices, targets, + args.scheduler_id, ireec_flags, args.attn_spec, args.decomp_attn, @@ -1045,9 +1063,9 @@ def numpy_to_pil_image(images): custom_vae=None, vae_precision=args.vae_precision, ) + vmfbs, weights = sdxl_pipe.check_prepared(mlirs, vmfbs, weights) - if args.cpu_scheduling: - vmfbs["scheduler"] = None + if args.npu_delegate_path: extra_device_args = {"npu_delegate_path": args.npu_delegate_path} else: diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py index b8bffe768..fd9adaa8f 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_scheduled_unet.py @@ -171,10 +171,18 @@ def export_scheduled_unet_model( # else: # do_classifier_free_guidance = True do_classifier_free_guidance = True - + filename_keys = [ + f"bs{batch_size}", + str(max_length), + f"{height}x{width}", + precision, + scheduler_id, + "DiffusionModule", + str(num_inference_steps), + ] safe_name = utils.create_safe_name( hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_scheduled_unet_{str(num_inference_steps)}", + "_".join(filename_keys), ) if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) From fd2185bcf2c43f02bb5baca5f05f2d357aa67491 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 20 Jun 2024 21:38:53 -0500 Subject: [PATCH 142/174] Make pipeline mode names mutually exclusive and fixes to weights loading --- .../sd3_inference/sd3_pipeline.py | 3 +- .../sdxl_inference/sdxl_compiled_pipeline.py | 39 ++++++++++++++----- .../custom_models/sdxl_inference/vae.py | 24 ++++++------ 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index ce57dbe15..256e4d21b 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -65,6 +65,7 @@ def __init__( vae_precision: str = "fp32", scheduler_id: str = None, # compatibility only, always uses EulerFlowScheduler shift: float = 1.0, + custom_vae: str = None, ): self.hf_model_name = hf_model_name # self.scheduler_id = scheduler_id @@ -122,7 +123,7 @@ def __init__( self.external_weights_dir = external_weights_dir self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn - self.custom_vae = None + self.custom_vae = custom_vae self.cpu_scheduling = cpu_scheduling self.torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 self.vae_precision = vae_precision if vae_precision else self.precision diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 8d6ce4ed1..260fd9f3d 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -32,8 +32,8 @@ "vae_decode": None, "prompt_encoder": None, "scheduled_unet": None, - "pipeline": None, - "full_pipeline": None, + "unetloop": None, + "fullpipeline": None, } EMPTY_FLAGS = { @@ -117,6 +117,12 @@ def __init__( 
self.vae_precision = vae_precision self.vae_dtype = "float32" if vae_precision == "fp32" else "float16" self.custom_vae = custom_vae + if self.custom_vae: + self.vae_dir = os.path.join( + self.pipeline_dir, utils.create_safe_name(custom_vae, "") + ) + if not os.path.exists(self.vae_dir): + os.makedirs(self.vae_dir) self.cpu_scheduling = cpu_scheduling self.compiled_pipeline = False self.split_scheduler = False @@ -173,6 +179,7 @@ def check_prepared( def is_prepared(self, vmfbs, weights): missing = [] dims = f"{str(self.width)}x{str(self.height)}" + pipeline_dir = self.pipeline_dir for key in vmfbs: if key == "scheduled_unet": keywords = [ @@ -189,6 +196,8 @@ def is_prepared(self, vmfbs, weights): elif key == "vae_decode": keywords = ["vae", self.vae_precision, dims] device_key = "vae" + if self.custom_vae: + pipeline_dir = self.vae_dir elif key == "prompt_encoder": keywords = ["prompt_encoder", self.precision, self.max_length] device_key = "clip" @@ -203,10 +212,10 @@ def is_prepared(self, vmfbs, weights): self.devices[device_key]["target"], ] ) - avail_files = os.listdir(self.pipeline_dir) + avail_files = os.listdir(pipeline_dir) for filename in avail_files: if all(str(x) in filename for x in keywords): - vmfbs[key] = os.path.join(self.pipeline_dir, filename) + vmfbs[key] = os.path.join(pipeline_dir, filename) if not vmfbs[key]: missing.append(key + " vmfb") @@ -432,6 +441,14 @@ def export_submodel( vae_torch = self.get_torch_models("vae_decode") else: vae_torch = None + if self.custom_vae: + vae_external_weight_path = os.path.join( + self.vae_dir, + f"vae_decode_{self.vae_precision}." + self.external_weights, + ) + vae_dir = self.vae_dir + else: + vae_dir = self.pipeline_dir vae_decode_vmfb = vae.export_vae_model( vae_torch, self.hf_model_name, @@ -448,7 +465,7 @@ def export_submodel( "decode", self.vae_decomp_attn, exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, + pipeline_dir=vae_dir, attn_spec=self.attn_spec, input_mlir=input_mlir["vae_decode"], weights_only=weights_only, @@ -468,14 +485,16 @@ def export_submodel( self.devices["clip"]["target"], self.ireec_flags["clip"], exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, + pipeline_dir=( + self.pipeline_dir if not self.custom_vae else self.vae_dir + ), input_mlir=input_mlir["prompt_encoder"], attn_spec=self.attn_spec, weights_only=weights_only, output_batchsize=self.batch_size, ) return prompt_encoder_vmfb, prompt_encoder_external_weight_path - case "pipeline": + case "unetloop": pipeline_file = get_pipeline_ir( self.width, self.height, @@ -490,7 +509,7 @@ def export_submodel( f"{str(self.width)}x{str(self.height)}", self.precision, str(self.max_length), - "pipeline", + "unetloop", ] pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, @@ -502,7 +521,7 @@ def export_submodel( mlir_source="str", ) return pipeline_vmfb, None - case "full_pipeline": + case "fullpipeline": pipeline_file = get_pipeline_ir( self.width, self.height, @@ -517,7 +536,7 @@ def export_submodel( f"{str(self.width)}x{str(self.height)}", self.precision, str(self.max_length), - "full_pipeline", + "fullpipeline", ] pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py index ed474256e..753cbb9e7 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae.py @@ -19,6 +19,7 @@ import torch import torch._dynamo as dynamo from diffusers import AutoencoderKL 
+import safetensors class VaeModel(torch.nn.Module): @@ -34,6 +35,14 @@ def __init__( hf_model_name, subfolder="vae", ) + elif "safetensors" in custom_vae: + custom_vae = safetensors.torch.load_file(custom_vae) + # custom vae as a HF state dict + self.vae = AutoencoderKL.from_pretrained( + hf_model_name, + subfolder="vae", + ) + self.vae.load_state_dict(custom_vae) elif not isinstance(custom_vae, dict): try: # custom HF repo with no vae subfolder @@ -46,13 +55,6 @@ def __init__( custom_vae, subfolder="vae", ) - else: - # custom vae as a HF state dict - self.vae = AutoencoderKL.from_pretrained( - hf_model_name, - subfolder="vae", - ) - self.vae.load_state_dict(custom_vae) def decode(self, inp): img = 1 / 0.13025 * inp @@ -104,10 +106,10 @@ def export_vae_model( attn_spec=attn_spec, ) return vmfb_path - if precision == "fp32" and device == "rocm": - decomp_attn = True - external_weights = None - print("Decomposing attention and inlining weights for fp32 VAE on ROCm") + # if precision == "fp32" and device == "rocm": + # decomp_attn = True + # external_weights = None + # print("Decomposing attention and inlining weights for fp32 VAE on ROCm") if device == "cpu": decomp_attn = True From 37860614d286e78aa5bd9f8d53252e3dca6607b9 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 20 Jun 2024 21:41:17 -0500 Subject: [PATCH 143/174] Add new keys to weights skip --- .../custom_models/sdxl_inference/sdxl_compiled_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 260fd9f3d..5a6d66d77 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -220,7 +220,7 @@ def is_prepared(self, vmfbs, weights): missing.append(key + " vmfb") for w_key in weights: - if any(x in w_key for x in ["pipeline", "scheduler"]) or ( + if any(x in w_key for x in ["fullpipeline", "unetloop", "scheduler"]) or ( self.external_weights is None ): continue From 37c3368937a8a79a0d9480376eb6c671aa5a21c2 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Fri, 21 Jun 2024 13:38:31 -0500 Subject: [PATCH 144/174] xfail llama test. --- models/turbine_models/tests/stateless_llama_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/models/turbine_models/tests/stateless_llama_test.py b/models/turbine_models/tests/stateless_llama_test.py index 884caa575..4b1ffef73 100644 --- a/models/turbine_models/tests/stateless_llama_test.py +++ b/models/turbine_models/tests/stateless_llama_test.py @@ -139,6 +139,9 @@ def test_vmfb_comparison(self): new_blob_name = new_blob_name[0] + "-pass.mlir" turbine_tank.changeBlobName(blob_name, new_blob_name) + # See: https://github.com/nod-ai/SHARK-Turbine/issues/601 + # Developed issues related to the pytorch 2.3 upgrade. + @unittest.expectedFailure def test_streaming_vmfb_comparison(self): """ Similar test to above but for streaming-LLM. From 5846d100223cd3ebc5446d8dc9edfe655e51605e Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Sat, 22 Jun 2024 19:12:44 -0500 Subject: [PATCH 145/174] Add a base class for turbine-models pipelines. 
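This adds models/turbine_models/custom_models/pipeline_base.py with two pieces:
PipelineComponent, which wraps a vmfbRunner together with its device, embedded
metadata, optional per-call benchmarking, and output casting to a DeviceArray,
torch.Tensor, or numpy array; and TurbinePipelineBase, which factors the
export/compile, weight-preparation, and load steps shared by the SD/SDXL/SD3
pipelines behind a per-submodel model map.

A minimal sketch of the intended PipelineComponent call pattern follows. The
device string, file paths, module name, entry-point name, and input tensor are
illustrative placeholders only — nothing outside PipelineComponent itself is
defined by this patch:

    import numpy as np
    import iree.runtime as ireert
    from turbine_models.custom_models.pipeline_base import PipelineComponent

    # Wrap one compiled submodel; outputs are returned as fp16 DeviceArrays.
    unet = PipelineComponent(dest_type=ireert.DeviceArray, dest_dtype="float16")
    unet.load(
        rt_device="rocm://0",                        # placeholder device
        vmfb_path="./shark_vmfbs/unet.vmfb",         # placeholder artifact
        module_name="compiled_unet",                 # placeholder module name
        external_weight_path="./shark_weights/unet.safetensors",
    )
    unet.benchmark = True                            # print per-call latency

    sample = np.zeros((1, 4, 128, 128), dtype=np.float16)  # placeholder input
    latents = unet("run_forward", [sample])          # placeholder entry point
    unet.unload()
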
--- .../custom_models/pipeline_base.py | 500 ++++++++++++++++++ .../sd3_inference/sd3_pipeline.py | 2 +- 2 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 models/turbine_models/custom_models/pipeline_base.py diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py new file mode 100644 index 000000000..990165bad --- /dev/null +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -0,0 +1,500 @@ +# Copyright 2024 Advanced Micro Devices, inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import logging +import torch + +import iree.runtime as ireert +from turbine_models.custom_models.sd_inference import utils, schedulers +from turbine_models.custom_models.sdxl_inference.pipeline_ir import ( + get_pipeline_ir, +) +from turbine_models.utils.sdxl_benchmark import run_benchmark +from turbine_models.model_runner import vmfbRunner + +from PIL import Image +import gc +import os +import numpy as np +import time +import copy +from datetime import datetime as dt + + +def merge_arg_into_map(model_map, arg, arg_name): + if isinstance(arg, dict): + for key in arg.keys(): + model_map[key][arg_name] = arg[key] + else: + for key in model_map.keys(): + model_map[key][arg_name] = arg + return model_map + + +class PipelineComponent: + """ + Wraps a VMFB runner with attributes for embedded metadata, device info, utilities and + has methods for handling I/O or otherwise assisting in interfacing with their pipeline + and its other components. + This aims to make new pipelines and execution modes easier to write, manage, and debug. + """ + + def __init__(self, dest_type=ireert.DeviceArray, dest_dtype="float16"): + self.runner = None + self.module_name = None + self.device = None + self.metadata = None + self.benchmark = False + self.output_type = dest_type + self.output_dtype = dest_dtype + + def load( + self, + rt_device: str, + vmfb_path: str, + module_name: str, + external_weight_path: str = None, + extra_plugin=None, + ): + self.module_name = module_name + self.runner = vmfbRunner( + rt_device, vmfb_path, external_weight_path, extra_plugin + ) + self.device = self.runner.config.device + self.module = getattr(self.runner.ctx.modules, module_name) + if "get_metadata" in self.module.keys(): + self.metadata = self.module["get_metadata"]() + + def unload(self): + self.device = None + self.runner = None + gc.collect() + + def _run(self, function_name, inputs: list): + return self.module[function_name](*inputs) + + def _run_and_benchmark(self, function_name, inputs: list): + start_time = time.time() + output = self._run(function_name, inputs) + latency = time.time() - start_time + print(f"Latency for {self.module_name}['{function_name}']: {latency}sec") + return output + + def __call__(self, function_name, inputs: list): + if not isinstance(inputs, list): + inputs = [inputs] + if self.benchmark: + output = self._run_and_benchmark(function_name, inputs) + else: + output = self._run(function_name, inputs) + if output.dtype() != self.output_dtype: + output = output.astype(self.output_dtype) + + match self.output_type: + case ireert.DeviceArray: + return output + case torch.Tensor: + return torch.tensor(output.to_host()) + case np.ndarray: + return output.to_host() + + +class TurbinePipelineBase: + """ + This class is a lightweight base for Stable Diffusion + inference API classes. 
It should provide methods for: + + - Exporting and compiling a set (model map) of torch IR modules + - preparing weights for an inference job + - loading weights for an inference job + - utilities i.e. filenames, downloads + + The general flow of an arbitrary child of this pipeline base is as follows: + 1. Initialize a model map and class attributes. + 2. Preparation: Check if all necessary files are present, and generate them if not. (prepare_all() / prepare_submodel()) + - This is done by submodel, so that users can generate a new submodel with the same pipeline. + - If vmfb not found, first check turbine tank for matching .vmfb file. + - If vmfb not downloadable, try downloading .mlir. + - If neither on Azure, run the export function in model map to export to torch IR and compile with IREE. + - If weights not found, run the export function in model map with weights_only=True. + - Apps should populate the weights with custom weights by now so they can be managed and converted if needed here. + 3. Load the pipeline: Load the prepared files onto devices as vmfbRunners. (load_pipeline() / load_submodel() / reload_submodel()) + 4. Run Inference: + + + + Arguments: + model_map: dict + A dictionary mapping submodel names to their export functions and hf model ids. This is used throughout the pipeline. + It also should provide I/O information for the submodels. + height: int + The height of the image to be generated + width: int + The width of the image to be generated + precision: str + The precision of the image latents. This usually decides the precision of all models in the pipeline. + max_length: int + The maximum sequence length for text encoders and diffusion models. + batch_size: int + The number of images to generate from each inference batch. This changes the shapes in all submodels. + device: str | dict[str] + Either a string i.e. "rocm://0", or a dictionary of such with keys matching the submodels of a given pipeline. + If a string, a dictionary will be created based on the pipeline's model map and the same device will be used for all submodels. + iree_target_triple: str | dict[str] + Either a string i.e. "gfx1100", or a dictionary with keys matching the submodels of a given pipeline. + ireec_flags: str | dict[str] + A comma-separated string of flags to pass to the IREE compiler, or a dict of them with keys matching submodels of a given pipeline. + """ + + def __init__( + self, + model_map: dict, + batch_size: int, + device: str | dict[str], + iree_target_triple: str | dict[str], + ireec_flags: str | dict[str] = None, + precision: str | dict[str] = "fp16", + td_spec: str | dict[str] = None, + decomp_attn: bool | dict[bool] = False, + external_weights: str | dict[str] = "safetensors", + pipeline_dir: str = "./shark_vmfbs", + external_weights_dir: str = "./shark_weights", + ): + self.map = model_map + self.batch_size = batch_size + if isinstance(device, dict): + assert isinstance( + iree_target_triple, dict + ), "Device and target triple must be both dicts or both strings." + for submodel in self.map.keys(): + assert submodel in device.keys(), f"Device for {submodel} not found." + assert ( + submodel in iree_target_triple.keys() + ), f"Target arch for {submodel} not found." 
+ self.map[submodel]["device"] = (device[submodel],) + self.map[submodel]["driver"] = ( + utils.iree_device_map(device[submodel]), + ) + self.map[submodel]["target"] = iree_target_triple[submodel] + else: + assert isinstance( + iree_target_triple, str + ), "Device and target triple must be both dicts or both strings." + for submodel in self.map.keys(): + self.map[submodel]["device"] = (device[submodel],) + self.map[submodel]["driver"] = ( + utils.iree_device_map(device[submodel]), + ) + self.map[submodel]["target"] = iree_target_triple[submodel] + map_arguments = { + "ireec_flags": ireec_flags, + "precision": precision, + "td_spec": td_spec, + "decomp_attn": decomp_attn, + "external_weights": external_weights, + } + for arg in map_arguments.keys(): + self.map = merge_arg_into_map(self.map, map_arguments[arg], arg) + np_dtypes = { + "fp16": np.float16, + "fp32": np.float32, + } + torch_dtypes = { + "fp16": torch.float16, + "fp32": torch.float32, + } + for submodel in self.map.keys(): + self.map = merge_arg_into_map( + self.map, np_dtypes[self.map[submodel]["precision"]], "np_dtype" + ) + self.map = merge_arg_into_map( + self.map, torch_dtypes[self.map[submodel]["precision"]], "torch_dtype" + ) + print(self.map) + + self.pipeline_dir = pipeline_dir + self.external_weights_dir = external_weights_dir + + # Disabled for now -- enable through option when turbine tank is ready. + self.download = False + + # These arguments are set at run or load time. + self.compiled_pipeline = False + self.split_scheduler = False + self.cpu_scheduling = False + + # TODO: set this based on user-inputted guidance scale and negative prompt. + self.do_classifier_free_guidance = True # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True + self._interrupt = False + + # FILE MANAGEMENT AND PIPELINE SETUP + + def prepare_all( + self, + mlirs: dict, + vmfbs: dict, + weights: dict, + interactive: bool = True, + ): + ready = self.is_prepared(vmfbs, weights) + match ready: + case True: + print("All necessary files found.") + return + case False: + if interactive: + do_continue = input( + f"\nIt seems you are missing some necessary files. Would you like to generate them now? (y/n)" + ) + if do_continue.lower() != "y": + exit() + for submodel in self.map.keys(): + if not self.map[submodel].get("vmfb"): + print("Fetching: ", submodel) + self.export_submodel(submodel, input_mlir=mlirs) + if not self.map[submodel]["external_weights"]: + assert not self.map[submodel].get( + "weights" + ), f"External weights should not be used for a model with inlined params." 
+ return self.prepare_all(mlirs, vmfbs, weights, interactive) + + def is_prepared(self, vmfbs, weights): + missing = {} + pipeline_dir = self.pipeline_dir + for key in self.map: + missing[key] = [] + # vmfb is already present in model map + if self.map[key].get("vmfb"): + continue + # vmfb is passed in to this function + elif vmfbs.get(key): + self.map[key]["vmfb"] = vmfbs[key] + continue + # search self.pipeline_dir for key-specific vmfb + keywords = self.map[key].get("keywords", []) + keywords.extend( + [ + self.map[key]["safe_name"], + "vmfb", + "bs" + str(self.batch_size), + self.map[key]["target"], + self.map[key]["precision"], + ] + ) + avail_files = os.listdir(pipeline_dir) + candidates = [] + for filename in avail_files: + if all(str(x) in filename for x in keywords): + candidates.append(os.path.join(pipeline_dir, filename)) + if len(candidates) == 1: + self.map[key]["vmfb"] = candidates[0] + elif len(candidates) > 1: + print(f"Multiple files found for {key}: {candidates}") + print(f"Choosing {candidates[0]} for {key}.") + self.map[key]["vmfb"] = candidates[0] + else: + # vmfb not found in pipeline_dir. Add to list of files to generate. + missing[key].append("vmfb") + + # Make sure vmfb needs external weights, as they may be inlined. + if self.map[key].get("external_weights"): + if self.map[key].get("weights"): + # weights already found in model map + continue + elif weights.get(key): + # weights passed in to this function + self.map[key]["weights"] = weights[key] + continue + # search self.external_weights_dir for key-specific weights + w_keywords = [ + self.map[key]["safe_name"], + self.map[key]["precision"], + self.map[key]["external_weights"], + ] + avail_files = os.listdir(self.external_weights_dir) + candidates = [] + for filename in avail_files: + if all(str(x) in filename for x in w_keywords): + candidates.append( + os.path.join(self.external_weights_dir, filename) + ) + if len(candidates) == 1: + self.map[key]["weights"] = candidates[0] + elif len(candidates) > 1: + print(f"Multiple weight files found for {key}: {candidates}") + print(f"Choosing {candidates[0]} for {key}.") + self.map[key][weights] = candidates[0] + else: + # weights not found in external_weights_dir. Add to list of files to generate. + missing[key].append("weights") + if any(missing[key].values()): + print(f"Missing files for {key}: ", missing[key]) + ready = False + else: + ready = True + return ready + + def get_mlir_from_turbine_tank(self, submodel, container_name): + from turbine_models.turbine_tank import downloadModelArtifacts + + safe_name = utils.create_safe_name( + self.hf_model_name, + f"_{self.max_length}_{self.height}x{self.width}_{self.precision}_{submodel}.mlir", + ) + mlir_path = downloadModelArtifacts( + safe_name, + container_name, + ) + return mlir_path + + # IMPORT / COMPILE PHASE + + def export_submodel( + self, + submodel: str, + weights_only: bool = False, + ): + if not os.path.exists(self.pipeline_dir): + os.makedirs(self.pipeline_dir) + + if self.map[submodel]["external_weights"] and self.external_weights_dir: + if not os.path.exists(self.external_weights_dir): + os.makedirs(self.external_weights_dir, exist_ok=False) + + self.map[submodel]["weights"] = os.path.join( + self.external_weights_dir, + f"{submodel}_{self.map[submodel]['precision']}." + + self.map["submodel"]["external_weights"], + ) + + elif not self.map["submodel"]["external_weights"]: + print( + "No external weights type specified using --external_weights, weights for imported .mlir files will not be externalized." 
+ ) + self.map[submodel]["weights"] = None + + if weights_only: + input_mlir = None + elif "mlir" in self.map[submodel].keys(): + input_mlir = self.map[submodel]["mlir"] + elif self.download: + try: + input_mlir = self.get_mlir_from_turbine_tank( + submodel, self.tank_container + ) + except: + input_mlir = None + else: + input_mlir = None + self.map[submodel]["mlir"] = input_mlir + + match submodel: + case "unetloop": #SDXL ONLY FOR NOW + pipeline_file = get_pipeline_ir( + self.width, + self.height, + self.precision, + self.batch_size, + self.max_length, + "unet_loop", + ) + pipeline_keys = [ + utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), + "bs" + str(self.batch_size), + f"{str(self.width)}x{str(self.height)}", + self.precision, + str(self.max_length), + "unetloop", + ] + vmfb_path = utils.compile_to_vmfb( + pipeline_file, + self.map["unet"]["device"], + self.map["unet"]["target"], + self.ireec_flags["pipeline"], + os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), + return_path=True, + mlir_source="str", + ) + self.map[submodel]["vmfb"] = vmfb_path + self.map[submodel]["weights"] = None + case "fullpipeline": #SDXL ONLY FOR NOW + pipeline_file = get_pipeline_ir( + self.width, + self.height, + self.precision, + self.batch_size, + self.max_length, + "tokens_to_image", + ) + pipeline_keys = [ + utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), + "bs" + str(self.batch_size), + f"{str(self.width)}x{str(self.height)}", + self.precision, + str(self.max_length), + "fullpipeline", + ] + vmfb_path = utils.compile_to_vmfb( + pipeline_file, + self.map["unet"]["device"], + self.map["unet"]["target"], + self.ireec_flags["pipeline"], + os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), + return_path=True, + mlir_source="str", + ) + self.map[submodel]["vmfb"] = vmfb_path + self.map[submodel]["weights"] = None + case _: + export_args = dict(**self.map[submodel]["export_args"]) + export_args["input_mlir"] = self.map[submodel].get("mlir") + vmfb_path = self.map[submodel]["export"](*export_args) + + # LOAD + def load_map(self): + for submodel in self.map.keys(): + self.load_submodel(submodel) + + def load_submodel(self, submodel): + if not self.map[submodel].get("vmfb"): + raise ValueError(f"VMFB not found for {submodel}.") + if not self.map[submodel].get("weights") and self.map[submodel].get( + "external_weights" + ): + raise ValueError(f"Weights not found for {submodel}.") + self.map[submodel]["runner"] = PipelineComponent() + self.map[submodel]["runner"].load( + self.map[submodel]["driver"], + self.map[submodel]["vmfb"], + self.map[submodel]["module_name"], + self.map[submodel].get("weights"), + self.map[submodel].get("extra_plugin"), + ) + setattr(self, submodel, self.map[submodel]["runner"]) + + def unload_submodel(self, submodel): + self.map[submodel]["runner"].unload() + setattr(self, submodel, None) + + +def numpy_to_pil_image(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
+ images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [] + for batched_image in images: + for image in range(0, batched_image.size(dim=0)): + pil_images.append(Image.fromarray(image.squeeze(), mode="L")) + else: + pil_images = [] + for image in images: + pil_images.append(Image.fromarray(image)) + return pil_images diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py index 256e4d21b..1068d6b6c 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py @@ -123,7 +123,7 @@ def __init__( self.external_weights_dir = external_weights_dir self.external_weights = external_weights self.vae_decomp_attn = vae_decomp_attn - self.custom_vae = custom_vae + self.custom_vae = None self.cpu_scheduling = cpu_scheduling self.torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16 self.vae_precision = vae_precision if vae_precision else self.precision From 7fabc3cb0490cbe87576ea56a27bceee1d40a121 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 13:09:46 -0500 Subject: [PATCH 146/174] Flag-guard padded attention preprocessing instruction, start adding tests for abstracted pipeline --- .../custom_models/sd_inference/utils.py | 10 +++ .../custom_models/sd_inference/vae.py | 1 - models/turbine_models/tests/pipeline_test.py | 69 +++++++++++++++++++ models/turbine_models/tests/sd3_test.py | 2 +- 4 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 models/turbine_models/tests/pipeline_test.py diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 8822d0144..ec616f125 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -68,8 +68,13 @@ "--iree-codegen-gpu-native-math-precision=true", "--iree-codegen-llvmgpu-use-vector-distribution=true", "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false", + ], + "pad_attention": [ "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics, iree-linalg-ext-pad-attention{pad-to-multiple-of=0,64,0,32,0}))", ], + "preprocess_default": [ + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", + ] "unet": [""], "clip": [""], "vae": [""], @@ -121,6 +126,7 @@ def compile_to_vmfb( save_mlir=True, attn_spec=None, winograd=False, + masked_attention=False, ): flags = [] if mlir_source == "file" and not isinstance(module_str, str): @@ -205,6 +211,10 @@ def compile_to_vmfb( if "gfx11" in target_triple: flags.extend(GFX11_flags["all"]) + if masked_attention: + flags.extend(GFX11_flags["pad_attention"]) + else: + flags.extend(GFX11_flags["preprocess_default"]) # Currently, we need a transform dialect script to be applied to the compilation through IREE in certain cases. # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. 
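A minimal usage sketch for the new masked_attention keyword (the MLIR file name and target below are illustrative). Existing callers are unaffected since the flag defaults to False, which keeps the plain pad-to-intrinsics preprocessing; passing True adds the iree-linalg-ext-pad-attention pass to the preprocessing pipeline.

    from turbine_models.custom_models.sd_inference import utils

    vmfb_path = utils.compile_to_vmfb(
        "stable_diffusion_unet.mlir",     # path to an exported MLIR module
        "rocm",                           # device backend
        "gfx1100",                        # target chip
        None,                             # extra ireec flags
        "stable_diffusion_unet_gfx1100",  # output name prefix
        return_path=True,
        mlir_source="file",
        masked_attention=True,            # opt in to attention padding
    )
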
diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index bd9e99a23..475cf1d1d 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -7,7 +7,6 @@ import os import sys -from iree import runtime as ireert from iree.compiler.ir import Context import numpy as np from shark_turbine.aot import * diff --git a/models/turbine_models/tests/pipeline_test.py b/models/turbine_models/tests/pipeline_test.py new file mode 100644 index 000000000..5c7d21011 --- /dev/null +++ b/models/turbine_models/tests/pipeline_test.py @@ -0,0 +1,69 @@ +# Copyright 2024 Advanced Micro Devices, inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import logging +import pytest +import unittest +import torch +import os +import numpy as np +from iree.compiler.ir import Context +from shark_turbine.aot import * +from turbine_models.custom_models.sd_inference import utils + +class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc1 = torch.nn.Linear(10, 10) + self.fc2 = torch.nn.Linear(10, 10) + + def forward(self, x): + x = self.fc1(x) + x = self.fc2(x) + return x + +torch.no_grad() +def export_dummy_model(): + model = TestModule() + target = "x86_64-unknown-linux-gnu" + device = "llvm-cpu" + model_metadata = { + 'model_name': "TestModel2xLinear", + 'input_shapes': [(10,)], + 'input_dtypes': ["float32"], + 'output_shapes': [(10,)], + 'output_dtypes': ["float32"], + 'test_kwarg_1': 'test_kwarg_1_value', + 'test_kwarg_2': 'test_kwarg_2_value', + } + dummy_input = torch.empty(10) + safe_name = model_metadata['model_name'].replace('/', '_') + vmfb_path = f"./{safe_name}.vmfb" + + fxb = FxProgramsBuilder(model) + + @fxb.export_program(args=(dummy_input,)) + def _forward(module, inputs): + return module.forward(inputs) + + class CompiledTester(CompiledModule): + forward = _forward + + inst = CompiledTester(context=Context(), import_to="IMPORT") + mlir_module = CompiledModule.get_mlir_module(inst) + breakpoint() + + + + +# class PipelineTest(unittest.TestCase): +# def setUp(self): +# model_map = { +# 'test_model_1': +# } + +if __name__ == "__main__": + export_dummy_model() \ No newline at end of file diff --git a/models/turbine_models/tests/sd3_test.py b/models/turbine_models/tests/sd3_test.py index b1fc664ac..95309947d 100644 --- a/models/turbine_models/tests/sd3_test.py +++ b/models/turbine_models/tests/sd3_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 Nod Labs, Inc +# Copyright 2024 Advanced Micro Devices, Inc. # # Licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. From 74f677f4b4b8327ea1d1590b057bcee394c2d12c Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 13:11:31 -0500 Subject: [PATCH 147/174] Add missing comma. 
--- models/turbine_models/custom_models/sd_inference/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index ec616f125..5420bd9fc 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -74,7 +74,7 @@ ], "preprocess_default": [ "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", - ] + ], "unet": [""], "clip": [""], "vae": [""], From bac7c63e8ad894df41501a28f0af42b934a501ad Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 13:23:18 -0500 Subject: [PATCH 148/174] propagate change to pipeline wrapper names. --- .../sdxl_inference/sdxl_compiled_pipeline.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 5a6d66d77..90b16d7b6 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -40,7 +40,8 @@ "clip": None, "unet": None, "vae": None, - "pipeline": None, + "unetloop": None, + "fullpipeline": None, } @@ -161,7 +162,7 @@ def check_prepared( if weights[submodel] is None: weights[submodel] = weight elif weights[submodel] is None and not any( - x in submodel for x in ["pipeline", "scheduler"] + x in submodel for x in ["unetloop", "scheduler"] ): _, weight = self.export_submodel(submodel, weights_only=True) weights[submodel] = weight @@ -351,8 +352,8 @@ def export_submodel( "prompt_encoder": None, "scheduled_unet": None, "unet": None, - "pipeline": None, - "full_pipeline": None, + "unetloop": None, + "fullpipeline": None, } match submodel: case "scheduled_unet": @@ -515,7 +516,7 @@ def export_submodel( pipeline_file, self.devices["unet"]["device"], self.devices["unet"]["target"], - self.ireec_flags["pipeline"], + self.ireec_flags["unetloop"], os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), return_path=True, mlir_source="str", @@ -542,7 +543,7 @@ def export_submodel( pipeline_file, self.devices["unet"]["device"], self.devices["unet"]["target"], - self.ireec_flags["pipeline"], + self.ireec_flags["unetloop"], os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), return_path=True, mlir_source="str", @@ -607,7 +608,7 @@ def load_pipeline( vmfbs["scheduled_unet"], vmfbs["prompt_encoder"], vmfbs["vae_decode"], - vmfbs["full_pipeline"], + vmfbs["fullpipeline"], ], [ weights["scheduled_unet"], @@ -626,7 +627,7 @@ def load_pipeline( self.devices["unet"]["driver"], [ vmfbs["scheduled_unet"], - vmfbs["pipeline"], + vmfbs["unetloop"], vmfbs["vae_decode"], vmfbs["prompt_encoder"], ], @@ -1002,8 +1003,8 @@ def numpy_to_pil_image(images): if args.split_scheduler: map["unet"] = None map.pop("scheduled_unet") - map.pop("pipeline") - map.pop("full_pipeline") + map.pop("unetloop") + map.pop("fullpipeline") mlirs = copy.deepcopy(map) vmfbs = copy.deepcopy(map) weights = copy.deepcopy(map) @@ -1042,7 +1043,7 @@ def numpy_to_pil_image(images): "clip": args.ireec_flags + args.clip_flags, "unet": args.ireec_flags + args.unet_flags, "vae": args.ireec_flags + args.vae_flags, - "pipeline": args.ireec_flags, + "unetloop": 
args.ireec_flags, "scheduler": args.ireec_flags, } if not args.pipeline_dir: From 4eca3b28dc910cebcf2d2b6d79fce99b5027cd73 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 14:19:05 -0500 Subject: [PATCH 149/174] Add paths to downloads for specs without masked attention. --- .../custom_models/pipeline_base.py | 4 +-- .../custom_models/sd_inference/utils.py | 21 ++++++++++---- models/turbine_models/tests/pipeline_test.py | 29 +++++++++++-------- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py index 990165bad..aee25c6c2 100644 --- a/models/turbine_models/custom_models/pipeline_base.py +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -392,7 +392,7 @@ def export_submodel( self.map[submodel]["mlir"] = input_mlir match submodel: - case "unetloop": #SDXL ONLY FOR NOW + case "unetloop": # SDXL ONLY FOR NOW pipeline_file = get_pipeline_ir( self.width, self.height, @@ -420,7 +420,7 @@ def export_submodel( ) self.map[submodel]["vmfb"] = vmfb_path self.map[submodel]["weights"] = None - case "fullpipeline": #SDXL ONLY FOR NOW + case "fullpipeline": # SDXL ONLY FOR NOW pipeline_file = get_pipeline_ir( self.width, self.height, diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 5420bd9fc..dc9c58d0d 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -221,10 +221,14 @@ def compile_to_vmfb( # This is a temporary solution, and should be removed or largely disabled once the functionality of # the TD spec is implemented in C++. if attn_spec in ["default", "mfma"]: - attn_spec = get_mfma_spec_path(target_triple, os.path.dirname(safe_name)) + attn_spec = get_mfma_spec_path( + target_triple, os.path.dirname(safe_name), masked_attention + ) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) elif attn_spec in ["wmma"] or ("gfx11" in target_triple and not attn_spec): - attn_spec = get_wmma_spec_path(target_triple, os.path.dirname(safe_name)) + attn_spec = get_wmma_spec_path( + target_triple, os.path.dirname(safe_name), masked_attention + ) if attn_spec: flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) elif attn_spec and attn_spec != "None": @@ -294,8 +298,11 @@ def create_safe_name(hf_model_name, model_name_str): return safe_name -def get_mfma_spec_path(target_chip, save_dir): - url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx942.mlir" +def get_mfma_spec_path(target_chip, save_dir, masked_attention=False): + if not masked_attention: + url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/no_pad/attention_and_matmul_spec_mfma.mlir" + else: + url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx942.mlir" attn_spec = urlopen(url).read().decode("utf-8") spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir") if os.path.exists(spec_path): @@ -305,8 +312,10 @@ def get_mfma_spec_path(target_chip, save_dir): return spec_path -def get_wmma_spec_path(target_chip, save_dir): - if target_chip == "gfx1100": +def get_wmma_spec_path(target_chip, save_dir, masked_attention=False): + if not masked_attention: + url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/no_pad/attention_and_matmul_spec_wmma.mlir" + elif target_chip 
== "gfx1100": url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx1100.mlir" elif target_chip in ["gfx1103", "gfx1150"]: url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx1150.mlir" diff --git a/models/turbine_models/tests/pipeline_test.py b/models/turbine_models/tests/pipeline_test.py index 5c7d21011..a9046073f 100644 --- a/models/turbine_models/tests/pipeline_test.py +++ b/models/turbine_models/tests/pipeline_test.py @@ -13,6 +13,8 @@ from iree.compiler.ir import Context from shark_turbine.aot import * from turbine_models.custom_models.sd_inference import utils +from shark_turbine.transforms import FuncOpMatcher, Pass + class TestModule(torch.nn.Module): def __init__(self): @@ -25,22 +27,25 @@ def forward(self, x): x = self.fc2(x) return x + torch.no_grad() + + def export_dummy_model(): model = TestModule() target = "x86_64-unknown-linux-gnu" device = "llvm-cpu" model_metadata = { - 'model_name': "TestModel2xLinear", - 'input_shapes': [(10,)], - 'input_dtypes': ["float32"], - 'output_shapes': [(10,)], - 'output_dtypes': ["float32"], - 'test_kwarg_1': 'test_kwarg_1_value', - 'test_kwarg_2': 'test_kwarg_2_value', + "model_name": "TestModel2xLinear", + "input_shapes": [(10,)], + "input_dtypes": ["float32"], + "output_shapes": [(10,)], + "output_dtypes": ["float32"], + "test_kwarg_1": "test_kwarg_1_value", + "test_kwarg_2": "test_kwarg_2_value", } dummy_input = torch.empty(10) - safe_name = model_metadata['model_name'].replace('/', '_') + safe_name = model_metadata["model_name"].replace("/", "_") vmfb_path = f"./{safe_name}.vmfb" fxb = FxProgramsBuilder(model) @@ -51,14 +56,14 @@ def _forward(module, inputs): class CompiledTester(CompiledModule): forward = _forward - + inst = CompiledTester(context=Context(), import_to="IMPORT") mlir_module = CompiledModule.get_mlir_module(inst) + funcop_pass = Pass(mlir_module.operation) + breakpoint() - - # class PipelineTest(unittest.TestCase): # def setUp(self): # model_map = { @@ -66,4 +71,4 @@ class CompiledTester(CompiledModule): # } if __name__ == "__main__": - export_dummy_model() \ No newline at end of file + export_dummy_model() From 4906549ecd5f4675b70cbe3c5deaa1e25ae172e4 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 18:14:39 -0500 Subject: [PATCH 150/174] Add test to and fixes for pipeline base --- .../custom_models/pipeline_base.py | 79 ++++++++----- models/turbine_models/tests/pipeline_test.py | 104 +++++++++++++++--- 2 files changed, 138 insertions(+), 45 deletions(-) diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py index aee25c6c2..24973e548 100644 --- a/models/turbine_models/custom_models/pipeline_base.py +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -27,10 +27,12 @@ def merge_arg_into_map(model_map, arg, arg_name): if isinstance(arg, dict): for key in arg.keys(): - model_map[key][arg_name] = arg[key] + if not model_map[key].get(arg_name): + model_map[key][arg_name] = arg[key] else: for key in model_map.keys(): - model_map[key][arg_name] = arg + if not model_map[key].get(arg_name): + model_map[key][arg_name] = arg return model_map @@ -65,15 +67,20 @@ def load( ) self.device = self.runner.config.device self.module = getattr(self.runner.ctx.modules, module_name) - if "get_metadata" in self.module.keys(): - self.metadata = self.module["get_metadata"]() + self.metadata = None def unload(self): self.device = None self.runner = None 
gc.collect() + def get_metadata(self, function_name): + if not self.metadata: + self.metadata = self.module[function_name].vm_function.reflection + return self.metadata + def _run(self, function_name, inputs: list): + print(inputs) return self.module[function_name](*inputs) def _run_and_benchmark(self, function_name, inputs: list): @@ -84,17 +91,22 @@ def _run_and_benchmark(self, function_name, inputs: list): return output def __call__(self, function_name, inputs: list): + casted_output = False if not isinstance(inputs, list): inputs = [inputs] if self.benchmark: output = self._run_and_benchmark(function_name, inputs) else: output = self._run(function_name, inputs) - if output.dtype() != self.output_dtype: + if output.dtype != self.output_dtype: + casted_output = True output = output.astype(self.output_dtype) - match self.output_type: case ireert.DeviceArray: + if casted_output: + output = ireert.asdevicearray( + self.device, output, self.output_dtype + ) return output case torch.Tensor: return torch.tensor(output.to_host()) @@ -159,7 +171,7 @@ def __init__( precision: str | dict[str] = "fp16", td_spec: str | dict[str] = None, decomp_attn: bool | dict[bool] = False, - external_weights: str | dict[str] = "safetensors", + external_weights: str | dict[str] = None, pipeline_dir: str = "./shark_vmfbs", external_weights_dir: str = "./shark_weights", ): @@ -174,21 +186,17 @@ def __init__( assert ( submodel in iree_target_triple.keys() ), f"Target arch for {submodel} not found." - self.map[submodel]["device"] = (device[submodel],) - self.map[submodel]["driver"] = ( - utils.iree_device_map(device[submodel]), - ) + self.map[submodel]["device"] = device[submodel] + self.map[submodel]["driver"] = utils.iree_device_map(device[submodel]) self.map[submodel]["target"] = iree_target_triple[submodel] else: assert isinstance( iree_target_triple, str ), "Device and target triple must be both dicts or both strings." for submodel in self.map.keys(): - self.map[submodel]["device"] = (device[submodel],) - self.map[submodel]["driver"] = ( - utils.iree_device_map(device[submodel]), - ) - self.map[submodel]["target"] = iree_target_triple[submodel] + self.map[submodel]["device"] = device + self.map[submodel]["driver"] = utils.iree_device_map(device) + self.map[submodel]["target"] = iree_target_triple map_arguments = { "ireec_flags": ireec_flags, "precision": precision, @@ -216,7 +224,11 @@ def __init__( print(self.map) self.pipeline_dir = pipeline_dir + if not os.path.exists(self.pipeline_dir): + os.makedirs(self.pipeline_dir) self.external_weights_dir = external_weights_dir + if not os.path.exists(self.external_weights_dir): + os.makedirs(self.external_weights_dir) # Disabled for now -- enable through option when turbine tank is ready. self.download = False @@ -234,10 +246,10 @@ def __init__( def prepare_all( self, - mlirs: dict, - vmfbs: dict, - weights: dict, - interactive: bool = True, + mlirs: dict = {}, + vmfbs: dict = {}, + weights: dict = {}, + interactive: bool = False, ): ready = self.is_prepared(vmfbs, weights) match ready: @@ -263,6 +275,7 @@ def prepare_all( def is_prepared(self, vmfbs, weights): missing = {} + ready = False pipeline_dir = self.pipeline_dir for key in self.map: missing[key] = [] @@ -301,6 +314,8 @@ def is_prepared(self, vmfbs, weights): # Make sure vmfb needs external weights, as they may be inlined. 
if self.map[key].get("external_weights"): + if self.map[key]["external_weights"]: + continue if self.map[key].get("weights"): # weights already found in model map continue @@ -330,11 +345,10 @@ def is_prepared(self, vmfbs, weights): else: # weights not found in external_weights_dir. Add to list of files to generate. missing[key].append("weights") - if any(missing[key].values()): - print(f"Missing files for {key}: ", missing[key]) - ready = False - else: - ready = True + if not any(x for x in missing.values()): + ready = True + else: + print("Missing files: ", missing) return ready def get_mlir_from_turbine_tank(self, submodel, container_name): @@ -355,6 +369,7 @@ def get_mlir_from_turbine_tank(self, submodel, container_name): def export_submodel( self, submodel: str, + input_mlir: str = None, weights_only: bool = False, ): if not os.path.exists(self.pipeline_dir): @@ -367,10 +382,10 @@ def export_submodel( self.map[submodel]["weights"] = os.path.join( self.external_weights_dir, f"{submodel}_{self.map[submodel]['precision']}." - + self.map["submodel"]["external_weights"], + + self.map[submodel]["external_weights"], ) - elif not self.map["submodel"]["external_weights"]: + elif not self.map[submodel].get("external_weights"): print( "No external weights type specified using --external_weights, weights for imported .mlir files will not be externalized." ) @@ -449,9 +464,13 @@ def export_submodel( self.map[submodel]["vmfb"] = vmfb_path self.map[submodel]["weights"] = None case _: - export_args = dict(**self.map[submodel]["export_args"]) - export_args["input_mlir"] = self.map[submodel].get("mlir") - vmfb_path = self.map[submodel]["export"](*export_args) + export_args = self.map[submodel].get("export_args", {}) + if self.map[submodel].get("input_mlir"): + export_args["input_mlir"] = self.map[submodel].get("mlir") + if export_args: + vmfb_path = self.map[submodel]["export_fn"](**export_args) + else: + vmfb_path = self.map[submodel]["export_fn"]() # LOAD def load_map(self): diff --git a/models/turbine_models/tests/pipeline_test.py b/models/turbine_models/tests/pipeline_test.py index a9046073f..1d0929516 100644 --- a/models/turbine_models/tests/pipeline_test.py +++ b/models/turbine_models/tests/pipeline_test.py @@ -13,7 +13,11 @@ from iree.compiler.ir import Context from shark_turbine.aot import * from turbine_models.custom_models.sd_inference import utils -from shark_turbine.transforms import FuncOpMatcher, Pass +from turbine_models.custom_models.pipeline_base import ( + PipelineComponent, + TurbinePipelineBase, +) +from shark_turbine.transforms.general.add_metadata import AddMetadataPass class TestModule(torch.nn.Module): @@ -35,17 +39,22 @@ def export_dummy_model(): model = TestModule() target = "x86_64-unknown-linux-gnu" device = "llvm-cpu" - model_metadata = { + model_metadata_forward = { "model_name": "TestModel2xLinear", - "input_shapes": [(10,)], + "input_shapes": [10], "input_dtypes": ["float32"], - "output_shapes": [(10,)], + "output_shapes": [10], "output_dtypes": ["float32"], "test_kwarg_1": "test_kwarg_1_value", "test_kwarg_2": "test_kwarg_2_value", } dummy_input = torch.empty(10) - safe_name = model_metadata["model_name"].replace("/", "_") + safe_keys = [ + model_metadata_forward["model_name"], + "fp32", + "bs1", + ] + safe_name = "_".join(safe_keys) vmfb_path = f"./{safe_name}.vmfb" fxb = FxProgramsBuilder(model) @@ -59,16 +68,81 @@ class CompiledTester(CompiledModule): inst = CompiledTester(context=Context(), import_to="IMPORT") mlir_module = CompiledModule.get_mlir_module(inst) - 
funcop_pass = Pass(mlir_module.operation) + metadata_pass = AddMetadataPass(mlir_module) + mlir_module = metadata_pass.run(model_metadata_forward, "forward") + vmfb_path = utils.compile_to_vmfb( + str(mlir_module), + device, + target, + None, + safe_name + "_" + target, + return_path=True, + ) + return vmfb_path + + +class TestPipeline(TurbinePipelineBase): + def __init__( + self, + **kwargs, + ): + super().__init__(**kwargs) + + def run(self, inputs: list): + return self.test_model_1("forward", *inputs) + + +class PipelineTest(unittest.TestCase): + def setUp(self): + model_map = { + "test_model_1": { + "model_name": "TestModel1", + "external_weights": None, + "module_name": "compiled_tester", + "safe_name": "TestModel2xLinear", + "keywords": ["Test", "Model", "2x", "Linear"], + "export_fn": export_dummy_model, + "export_args": None, + } + } + self.model_metadata_forward = { + "model_name": "TestModel2xLinear", + "input_shapes": {"0": "10,"}, + "input_dtypes": {"0": "float32"}, + "output_shapes": {"0": "10,"}, + "output_dtypes": {"0": "float32"}, + "test_kwarg_1": "test_kwarg_1_value", + "test_kwarg_2": "test_kwarg_2_value", + } + self.pipe = TestPipeline( + model_map=model_map, + batch_size=1, + device="cpu", + iree_target_triple="x86_64-unknown-linux-gnu", + pipeline_dir="./", + precision="fp32", + ) + self.pipe.prepare_all() + self.pipe.load_map() + self.test_input = [torch.ones(10)] + + def test_pipeline(self): + output = self.pipe.run(self.test_input).to_host() + print(output) + + def test_pipeline_benchmark(self): + self.pipe.test_model_1.benchmark = True + output = self.pipe.run(self.test_input).to_host() + print(output) + + def test_pipeline_metadata(self): + metadata = self.pipe.test_model_1.get_metadata("forward") + assert ( + self.model_metadata_forward.keys() == metadata.keys() + ), "Metadata keys mismatch: expected {}, got {}".format( + self.model_metadata_forward.keys(), metadata.keys() + ) - breakpoint() - - -# class PipelineTest(unittest.TestCase): -# def setUp(self): -# model_map = { -# 'test_model_1': -# } if __name__ == "__main__": - export_dummy_model() + unittest.main() From 9eff4439bb4880cedc6c9d473d0a4bd282ed934b Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 18:16:31 -0500 Subject: [PATCH 151/174] Flag-guard pad attention for instinct as well. 
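With this change the Instinct (MI*) path uses the same guard as gfx11: the attention-padding preprocessing is only requested when masked_attention is set, otherwise only pad-to-intrinsics runs. The pad-to-multiple-of values differ per target family (0,128,0,32,0 in MI_flags vs 0,64,0,32,0 for gfx11). A self-contained sketch of the two alternatives being selected between; the helper below is illustrative, the real strings live in the flag tables in utils.py:

    PAD_ATTENTION = (
        "--iree-preprocessing-pass-pipeline=builtin.module("
        "iree-preprocessing-transpose-convolution-pipeline, "
        "iree-global-opt-raise-special-ops, "
        "util.func(iree-preprocessing-pad-to-intrinsics, "
        "iree-linalg-ext-pad-attention{pad-to-multiple-of=0,128,0,32,0}))"
    )
    PREPROCESS_DEFAULT = (
        "--iree-preprocessing-pass-pipeline=builtin.module("
        "iree-preprocessing-transpose-convolution-pipeline, "
        "iree-global-opt-raise-special-ops, "
        "util.func(iree-preprocessing-pad-to-intrinsics))"
    )

    def preprocessing_flag(masked_attention: bool) -> str:
        # Pad attention shapes only when masked attention support is requested.
        return PAD_ATTENTION if masked_attention else PREPROCESS_DEFAULT
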
--- .../turbine_models/custom_models/sd_inference/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index dc9c58d0d..cf6b5946a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -35,8 +35,13 @@ "--iree-codegen-gpu-native-math-precision=true", "--iree-rocm-waves-per-eu=2", "--iree-flow-inline-constants-max-byte-length=1", + ], + "pad_attention": [ "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics, iree-linalg-ext-pad-attention{pad-to-multiple-of=0,128,0,32,0}))", ], + "preprocess_default": [ + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-global-opt-raise-special-ops, util.func(iree-preprocessing-pad-to-intrinsics))", + ], "unet": [ "--iree-flow-enable-aggressive-fusion", "--iree-global-opt-enable-fuse-horizontal-contractions=true", @@ -208,6 +213,10 @@ def compile_to_vmfb( elif "vae" in safe_name: flags.extend(MI_flags["vae"]) flags.extend(MI_flags["all"]) + if masked_attention: + flags.extend(GFX11_flags["pad_attention"]) + else: + flags.extend(GFX11_flags["preprocess_default"]) if "gfx11" in target_triple: flags.extend(GFX11_flags["all"]) From df403097661d771893029040cd2c3540790dfd5d Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Mon, 24 Jun 2024 19:12:34 -0500 Subject: [PATCH 152/174] Verify metadata results exactly and fix pass invocation to match upstream. --- models/turbine_models/tests/pipeline_test.py | 41 ++++++++------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/models/turbine_models/tests/pipeline_test.py b/models/turbine_models/tests/pipeline_test.py index 1d0929516..76e33c96a 100644 --- a/models/turbine_models/tests/pipeline_test.py +++ b/models/turbine_models/tests/pipeline_test.py @@ -19,6 +19,16 @@ ) from shark_turbine.transforms.general.add_metadata import AddMetadataPass +model_metadata_forward = { + "model_name": "TestModel2xLinear", + "input_shapes": [10], + "input_dtypes": ["float32"], + "output_shapes": [10], + "output_dtypes": ["float32"], + "test_kwarg_1": "test_kwarg_1_value", + "test_kwarg_2": "test_kwarg_2_value", +} + class TestModule(torch.nn.Module): def __init__(self): @@ -39,15 +49,7 @@ def export_dummy_model(): model = TestModule() target = "x86_64-unknown-linux-gnu" device = "llvm-cpu" - model_metadata_forward = { - "model_name": "TestModel2xLinear", - "input_shapes": [10], - "input_dtypes": ["float32"], - "output_shapes": [10], - "output_dtypes": ["float32"], - "test_kwarg_1": "test_kwarg_1_value", - "test_kwarg_2": "test_kwarg_2_value", - } + dummy_input = torch.empty(10) safe_keys = [ model_metadata_forward["model_name"], @@ -68,8 +70,7 @@ class CompiledTester(CompiledModule): inst = CompiledTester(context=Context(), import_to="IMPORT") mlir_module = CompiledModule.get_mlir_module(inst) - metadata_pass = AddMetadataPass(mlir_module) - mlir_module = metadata_pass.run(model_metadata_forward, "forward") + mlir_module = AddMetadataPass(mlir_module, model_metadata_forward, "forward").run() vmfb_path = utils.compile_to_vmfb( str(mlir_module), device, @@ -105,15 +106,6 @@ def setUp(self): "export_args": None, } } - self.model_metadata_forward = { - "model_name": "TestModel2xLinear", - "input_shapes": {"0": "10,"}, - 
"input_dtypes": {"0": "float32"}, - "output_shapes": {"0": "10,"}, - "output_dtypes": {"0": "float32"}, - "test_kwarg_1": "test_kwarg_1_value", - "test_kwarg_2": "test_kwarg_2_value", - } self.pipe = TestPipeline( model_map=model_map, batch_size=1, @@ -137,10 +129,11 @@ def test_pipeline_benchmark(self): def test_pipeline_metadata(self): metadata = self.pipe.test_model_1.get_metadata("forward") - assert ( - self.model_metadata_forward.keys() == metadata.keys() - ), "Metadata keys mismatch: expected {}, got {}".format( - self.model_metadata_forward.keys(), metadata.keys() + expected = model_metadata_forward + for i in expected.keys(): + expected[i] = str(expected[i]) + assert expected == metadata, "Metadata mismatch: expected {}, got {}".format( + expected, metadata ) From 39d8551dbe4b1a12b8ddaed79f13f0aa9c8d066b Mon Sep 17 00:00:00 2001 From: IanNod <45800100+IanNod@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:05:39 -0700 Subject: [PATCH 153/174] Bug fix for specified device when exporting submodels Exporting models was using "devices" instead of "driver" which caused errors on compiling given a specified rocm://<> device. Changed to fix bug --- .../sdxl_inference/sdxl_compiled_pipeline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 90b16d7b6..7d266a875 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -375,7 +375,7 @@ def export_submodel( "vmfb", self.external_weights, unet_external_weight_path, - self.devices["unet"]["device"], + self.devices["unet"]["driver"], self.devices["unet"]["target"], self.ireec_flags["unet"], self.decomp_attn, @@ -404,7 +404,7 @@ def export_submodel( "vmfb", self.external_weights, unet_external_weight_path, - self.devices["unet"]["device"], + self.devices["unet"]["driver"], self.devices["unet"]["target"], self.ireec_flags["unet"], self.decomp_attn, @@ -429,7 +429,7 @@ def export_submodel( self.num_inference_steps, self.precision, "vmfb", - self.devices["unet"]["device"], + self.devices["unet"]["driver"], self.devices["unet"]["target"], self.ireec_flags["scheduler"], exit_on_vmfb=False, @@ -460,7 +460,7 @@ def export_submodel( "vmfb", self.external_weights, vae_external_weight_path, - self.devices["vae"]["device"], + self.devices["vae"]["driver"], self.devices["vae"]["target"], self.ireec_flags["vae"], "decode", @@ -482,7 +482,7 @@ def export_submodel( "vmfb", self.external_weights, prompt_encoder_external_weight_path, - self.devices["clip"]["device"], + self.devices["clip"]["driver"], self.devices["clip"]["target"], self.ireec_flags["clip"], exit_on_vmfb=False, From 2da92157f8dfab5833402c95837f040570ac69f6 Mon Sep 17 00:00:00 2001 From: IanNod <45800100+IanNod@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:51:05 -0700 Subject: [PATCH 154/174] Missed device specified when exporting pipeline models Missed a couple "devices" instead of "driver" changes --- .../custom_models/sdxl_inference/sdxl_compiled_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 7d266a875..8e85539ab 100644 --- 
a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -514,7 +514,7 @@ def export_submodel( ] pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, - self.devices["unet"]["device"], + self.devices["unet"]["driver"], self.devices["unet"]["target"], self.ireec_flags["unetloop"], os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), @@ -541,7 +541,7 @@ def export_submodel( ] pipeline_vmfb = utils.compile_to_vmfb( pipeline_file, - self.devices["unet"]["device"], + self.devices["unet"]["driver"], self.devices["unet"]["target"], self.ireec_flags["unetloop"], os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), From f330d255c14dd0c0171f12811ccdb30aacb088cf Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Mon, 1 Jul 2024 20:53:20 +0000 Subject: [PATCH 155/174] Adds option for prompt encoder to use batched inputs New flag batch_prompt_input determines if prompt encoder uses batchsize flag to concat output, or to batch the input shapes --- .../sdxl_inference/sdxl_cmd_opts.py | 7 +++ .../sdxl_inference/sdxl_compiled_pipeline.py | 6 ++- .../sdxl_inference/sdxl_prompt_encoder.py | 52 ++++++++++++------- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py index c1c21301b..368fb0d74 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py @@ -222,6 +222,13 @@ def is_valid_file(arg): ############################################################################## p.add_argument("--batch_size", type=int, default=1, help="Batch size for inference") +p.add_argument( + "--batch_prompt_input", + type=bool, + default=False, + help="If batch size > 1 this enables batching the prompt encoder input rather than concating prompt encoders output", +) + p.add_argument( "--height", type=int, default=1024, help="Height of Stable Diffusion output image." 
) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 8e85539ab..87c460849 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -68,6 +68,7 @@ def __init__( custom_vae: str = "", cpu_scheduling: bool = False, vae_precision: str = "fp32", + batch_prompt_input: bool = False, ): self.hf_model_name = hf_model_name self.scheduler_id = scheduler_id @@ -76,6 +77,7 @@ def __init__( self.precision = precision self.max_length = max_length self.batch_size = batch_size + self.batch_prompt_input = batch_prompt_input self.num_inference_steps = num_inference_steps self.devices = {} if isinstance(device, dict): @@ -492,7 +494,8 @@ def export_submodel( input_mlir=input_mlir["prompt_encoder"], attn_spec=self.attn_spec, weights_only=weights_only, - output_batchsize=self.batch_size, + batch_size=self.batch_size, + batch_input=self.batch_prompt_input, ) return prompt_encoder_vmfb, prompt_encoder_external_weight_path case "unetloop": @@ -1082,6 +1085,7 @@ def numpy_to_pil_image(images): args.vae_decomp_attn, custom_vae=None, vae_precision=args.vae_precision, + batch_prompt_input=args.batch_prompt_input, ) vmfbs, weights = sdxl_pipe.check_prepared(mlirs, vmfbs, weights) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index ecfc4baf6..224e63233 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -25,6 +25,7 @@ def __init__( hf_auth_token=None, do_classifier_free_guidance=True, batch_size=1, + batch_input=False, ): super().__init__() self.torch_dtype = torch.float16 if precision == "fp16" else torch.float32 @@ -40,6 +41,7 @@ def __init__( ) self.do_classifier_free_guidance = True self.batch_size = batch_size + self.batch_input = batch_input def forward( self, text_input_ids_1, text_input_ids_2, uncond_input_ids_1, uncond_input_ids_2 @@ -83,20 +85,25 @@ def forward( pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view( bs_embed * 1, -1 ) - prompt_embeds = prompt_embeds.repeat(self.batch_size, 1, 1) + if not self.batch_input: + prompt_embeds = prompt_embeds.repeat(self.batch_size, 1, 1) add_text_embeds = pooled_prompt_embeds - add_text_embeds = add_text_embeds.repeat(self.batch_size, 1) + if not self.batch_input: + add_text_embeds = add_text_embeds.repeat(self.batch_size, 1) if self.do_classifier_free_guidance: - neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat(1, 1).view( - 1, -1 - ) + if not self.batch_input: + neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat( + 1, 1 + ).view(1, -1) neg_prompt_embeds = neg_prompt_embeds.repeat(1, 1, 1) neg_prompt_embeds = neg_prompt_embeds.view(bs_embed * 1, seq_len, -1) - neg_prompt_embeds = neg_prompt_embeds.repeat(self.batch_size, 1, 1) + if not self.batch_input: + neg_prompt_embeds = neg_prompt_embeds.repeat(self.batch_size, 1, 1) prompt_embeds = torch.cat([neg_prompt_embeds, prompt_embeds], dim=0) - neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat( - self.batch_size, 1 - ) + if not self.batch_input: + neg_pooled_prompt_embeds = neg_pooled_prompt_embeds.repeat( + self.batch_size, 1 + ) add_text_embeds = torch.cat( [neg_pooled_prompt_embeds, add_text_embeds], dim=0 ) @@ 
-160,7 +167,8 @@ def export_prompt_encoder( input_mlir=None, attn_spec=None, weights_only=False, - output_batchsize=1, + batchsize=1, + batch_input=False, ): if "turbo" in hf_model_name: do_classifier_free_guidance = False @@ -169,7 +177,7 @@ def export_prompt_encoder( safe_name = utils.create_safe_name( hf_model_name, - f"_bs{output_batchsize}_{str(max_length)}-{precision}-prompt-encoder-{device}", + f"_bs{batchsize}_{str(max_length)}-{precision}-prompt-encoder-{device}", ) if pipeline_dir not in [None, ""]: safe_name = os.path.join(pipeline_dir, safe_name) @@ -206,8 +214,14 @@ def export_prompt_encoder( precision, hf_auth_token, do_classifier_free_guidance, - batch_size=output_batchsize, + batch_size=batchsize, + batch_input=batch_input, ) + + input_batchsize = 1 + if batch_input: + input_batchsize = batchsize + if precision == "fp16": prompt_encoder_module = prompt_encoder_module.half() mapper = {} @@ -232,10 +246,10 @@ class CompiledClip(CompiledModule): def encode_prompts( self, - t_ids_1=AbstractTensor(1, max_length, dtype=torch.int64), - t_ids_2=AbstractTensor(1, max_length, dtype=torch.int64), - uc_ids_1=AbstractTensor(1, max_length, dtype=torch.int64), - uc_ids_2=AbstractTensor(1, max_length, dtype=torch.int64), + t_ids_1=AbstractTensor(input_batchsize, max_length, dtype=torch.int64), + t_ids_2=AbstractTensor(input_batchsize, max_length, dtype=torch.int64), + uc_ids_1=AbstractTensor(input_batchsize, max_length, dtype=torch.int64), + uc_ids_2=AbstractTensor(input_batchsize, max_length, dtype=torch.int64), ): return jittable(prompt_encoder_module.forward)( t_ids_1, t_ids_2, uc_ids_1, uc_ids_2 @@ -243,8 +257,8 @@ def encode_prompts( def encode_prompts_turbo( self, - t_ids_1=AbstractTensor(1, max_length, dtype=torch.int64), - t_ids_2=AbstractTensor(1, max_length, dtype=torch.int64), + t_ids_1=AbstractTensor(input_batchsize, max_length, dtype=torch.int64), + t_ids_2=AbstractTensor(input_batchsize, max_length, dtype=torch.int64), ): return jittable(prompt_encoder_module.forward_turbo)(t_ids_1, t_ids_2) @@ -287,7 +301,7 @@ def encode_prompts_turbo( pipeline_dir=args.pipeline_dir, input_mlir=args.input_mlir, attn_spec=args.attn_spec, - output_batchsize=args.batch_size, + batchsize=args.batch_size, ) if args.input_mlir: exit() From 79a094f39ae71b4654d29de376a740a4435393df Mon Sep 17 00:00:00 2001 From: IanNod <45800100+IanNod@users.noreply.github.com> Date: Mon, 1 Jul 2024 16:16:45 -0700 Subject: [PATCH 156/174] Minor bug fix in batching Fixed typo for sdxl_prompt_encoder arg --- .../custom_models/sdxl_inference/sdxl_compiled_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py index 87c460849..ec88c525d 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py @@ -494,7 +494,7 @@ def export_submodel( input_mlir=input_mlir["prompt_encoder"], attn_spec=self.attn_spec, weights_only=weights_only, - batch_size=self.batch_size, + batchsize=self.batch_size, batch_input=self.batch_prompt_input, ) return prompt_encoder_vmfb, prompt_encoder_external_weight_path From d534cd4f1fbe9003b6435023cc774a6a1697477f Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Wed, 10 Jul 2024 03:39:56 -0500 Subject: [PATCH 157/174] Consolidates SD pipelines and adds support for 
 sharktank unet. (#766)

---
 .github/workflows/test_shark.yml              |    2 +-
 models/requirements.txt                       |    4 +-
 .../custom_models/pipeline_base.py            |  386 +++++--
 .../custom_models/sd3_inference/sd3_mmdit.py  |   20 +-
 .../sd3_inference/sd3_schedulers.py           |   30 +-
 .../sd3_inference/sd3_text_encoders.py        |   23 +-
 .../sd3_inference/sd3_vae_runner.py           |    4 +-
 .../custom_models/sd_inference/clip.py        |  116 +-
 .../custom_models/sd_inference/clip_runner.py |    2 +-
 .../custom_models/sd_inference/schedulers.py  |   76 +-
 .../custom_models/sd_inference/sd_cmd_opts.py |   34 +-
 .../custom_models/sd_inference/sd_pipeline.py | 1024 +++++++++--------
 .../sd_inference/tokenization.py              |  541 +++------
 .../custom_models/sd_inference/unet.py        |  151 +--
 .../custom_models/sd_inference/unet_runner.py |   51 +-
 .../custom_models/sd_inference/utils.py       |   82 +-
 .../custom_models/sd_inference/vae.py         |  165 ++-
 .../custom_models/sd_inference/vae_runner.py  |   12 +-
 .../sdxl_inference/sdxl_prompt_encoder.py     |   43 +-
 .../custom_models/sdxl_inference/unet.py      |  236 +++-
 .../sdxl_inference/unet_runner.py             |    4 +-
 .../sdxl_inference/vae_runner.py              |   50 +-
 models/turbine_models/tests/conftest.py       |    4 +-
 models/turbine_models/tests/pipeline_test.py  |    8 +-
 models/turbine_models/tests/sd3_test.py       |  229 +---
 models/turbine_models/tests/sd_test.py        |  292 ++--
 models/turbine_models/tests/sdxl_test.py      |  272 ++---
 27 files changed, 1939 insertions(+), 1922 deletions(-)

diff --git a/.github/workflows/test_shark.yml b/.github/workflows/test_shark.yml
index 301376a47..6f2e4b4ed 100644
--- a/.github/workflows/test_shark.yml
+++ b/.github/workflows/test_shark.yml
@@ -20,7 +20,7 @@ jobs:
     strategy:
       matrix:
         version: [3.11]
-        os: [nodai-ubuntu-builder-large]
+        os: [nodai-amdgpu-mi250-x86-64]
     runs-on: ${{matrix.os}}
     steps:
diff --git a/models/requirements.txt b/models/requirements.txt
index bdd1892e8..b7b7d8d2b 100644
--- a/models/requirements.txt
+++ b/models/requirements.txt
@@ -1,5 +1,5 @@
 protobuf
-shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
+gguf
 transformers==4.37.1
 torchsde
 accelerate
@@ -12,3 +12,5 @@ azure-storage-blob
 einops
 pytest
 scipy
+shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
+sharktank @ git+https://github.com/nod-ai/sharktank@main#subdirectory=sharktank
diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py
index 24973e548..d46e20b84 100644
--- a/models/turbine_models/custom_models/pipeline_base.py
+++ b/models/turbine_models/custom_models/pipeline_base.py
@@ -6,6 +6,8 @@
 import logging
 
 import torch
+import ast
+from collections.abc import Iterable
 
 import iree.runtime as ireert
 from turbine_models.custom_models.sd_inference import utils, schedulers
@@ -23,10 +25,25 @@
 import copy
 from datetime import datetime as dt
 
+np_dtypes = {
+    "fp16": np.float16,
+    "fp32": np.float32,
+    "float16": np.float16,
+    "float32": np.float32,
+}
+torch_dtypes = {
+    "fp16": torch.float16,
+    "fp32": torch.float32,
+    "float16": torch.float16,
+    "float32": torch.float32,
+}
+
 
 def merge_arg_into_map(model_map, arg, arg_name):
     if isinstance(arg, dict):
         for key in arg.keys():
+            if key not in model_map.keys():
+                continue
             if not model_map[key].get(arg_name):
                 model_map[key][arg_name] = arg[key]
             else:
@@ -36,6 +53,28 @@ def merge_arg_into_map(model_map, arg, arg_name):
     return model_map
 
 
+def merge_export_arg(model_map, arg, arg_name):
+    if isinstance(arg, dict):
+        for key in arg.keys():
+            if key not in model_map.keys():
+                continue
+            if arg_name not in
model_map[key].get("export_args", {}): + model_map[key]["export_args"][arg_name] = arg[key] + else: + for key in model_map.keys(): + if not model_map[key].get("export_args", {}).get(arg_name): + continue + model_map[key]["export_args"][arg_name] = arg + return model_map + + +# def str_to_list(string): +# out = string.strip("[]").replace(" ", "").split(";") +# for item in out: +# item = ast.literal_eval(item) +# return out + + class PipelineComponent: """ Wraps a VMFB runner with attributes for embedded metadata, device info, utilities and @@ -44,14 +83,14 @@ class PipelineComponent: This aims to make new pipelines and execution modes easier to write, manage, and debug. """ - def __init__(self, dest_type=ireert.DeviceArray, dest_dtype="float16"): + def __init__(self, dest_type="devicearray", dest_dtype="float16"): self.runner = None self.module_name = None self.device = None self.metadata = None self.benchmark = False - self.output_type = dest_type - self.output_dtype = dest_dtype + self.dest_type = dest_type + self.dest_dtype = dest_dtype def load( self, @@ -62,25 +101,120 @@ def load( extra_plugin=None, ): self.module_name = module_name + print( + f"Loading {module_name} from {vmfb_path} with external weights: {external_weight_path}." + ) self.runner = vmfbRunner( rt_device, vmfb_path, external_weight_path, extra_plugin ) self.device = self.runner.config.device self.module = getattr(self.runner.ctx.modules, module_name) - self.metadata = None + self.get_metadata() def unload(self): self.device = None self.runner = None gc.collect() - def get_metadata(self, function_name): - if not self.metadata: - self.metadata = self.module[function_name].vm_function.reflection - return self.metadata + def get_metadata(self): + self.metadata = {} + for function_name in self.module.vm_module.function_names: + if any(x in function_name for x in ["$async", "__init"]): + continue + try: + self.metadata[function_name] = self.module[ + function_name + ].vm_function.reflection + except: + logging.warning( + f"Could not get metadata for {self.module_name}['{function_name}']." + ) + self.metadata[function_name] = None + + def _validate_or_convert_inputs(self, function_name, inputs): + if self.metadata: + expected_input_shapes = self.metadata.get(function_name, {}).get( + "input_shapes" + ) + if expected_input_shapes: + expected_input_shapes = ast.literal_eval(expected_input_shapes) + expected_input_dtypes = self.metadata.get(function_name, {}).get( + "input_dtypes", "" + ) + if expected_input_dtypes: + expected_input_dtypes = ast.literal_eval(expected_input_dtypes) + if not isinstance(expected_input_shapes, list): + expected_input_shapes = [expected_input_shapes] + if not expected_input_dtypes: + pass + if not expected_input_shapes: + logging.warning( + f"No input shapes found for {self.module_name}['{function_name}']." + ) + for i in inputs: + if not isinstance(i, ireert.DeviceArray): + i = ireert.asdevicearray(self.device, i) + pass + for i, input_dtype in enumerate(expected_input_dtypes): + if not isinstance(inputs[i], ireert.DeviceArray): + if isinstance(inputs[i], torch.Tensor) or isinstance( + inputs[i], torch.HalfTensor + ): + new_input = inputs[i].float().cpu().numpy() + else: + new_input = inputs[i] + + inputs[i] = ireert.asdevicearray( + self.device, new_input, input_dtype + ) + if str(inputs[i].dtype).split(".")[-1] != input_dtype: + logging.warning( + f"Converting input {i} to {input_dtype} for {self.module_name}['{function_name}']." 
+ ) + inputs[i] = inputs[i].astype(input_dtype) + for i, input_shape in enumerate(expected_input_shapes): + if isinstance(input_shape, str): + input_shape = ast.literal_eval(input_shape) + elif not input_shape: + continue + if tuple(inputs[i].shape) != tuple(input_shape): + raise ValueError( + f"Expected input {i} to be of shape {input_shape} for {self.module_name}['{function_name}'], got {str(tuple(inputs[i].shape))}." + ) + else: + logging.warning( + f"No metadata found for {self.module_name}['{function_name}']." + ) + for idx, i in enumerate(inputs): + if not isinstance(i, ireert.DeviceArray): + inputs[idx] = ireert.asdevicearray(self.device, i) + + def _output_cast(self, output): + if isinstance(output, tuple): + out_tuple = () + for array in output: + array_out = self._output_cast(array) + out_tuple += (array_out,) + return out_tuple + match self.dest_type: + case "devicearray": + output = ( + output.astype(self.dest_dtype) + if output.dtype != self.dest_dtype + else output + ) + return output + case "torch": + output = torch.tensor( + output.to_host(), dtype=torch_dtypes[self.dest_dtype] + ) + return output + case "numpy": + return output.to_host().astype(np_dtypes[self.dest_dtype]) + case _: + return output def _run(self, function_name, inputs: list): - print(inputs) return self.module[function_name](*inputs) def _run_and_benchmark(self, function_name, inputs: list): @@ -92,26 +226,15 @@ def _run_and_benchmark(self, function_name, inputs: list): def __call__(self, function_name, inputs: list): casted_output = False + self._validate_or_convert_inputs(function_name, inputs) if not isinstance(inputs, list): inputs = [inputs] if self.benchmark: output = self._run_and_benchmark(function_name, inputs) else: output = self._run(function_name, inputs) - if output.dtype != self.output_dtype: - casted_output = True - output = output.astype(self.output_dtype) - match self.output_type: - case ireert.DeviceArray: - if casted_output: - output = ireert.asdevicearray( - self.device, output, self.output_dtype - ) - return output - case torch.Tensor: - return torch.tensor(output.to_host()) - case np.ndarray: - return output.to_host() + output = self._output_cast(output) + return output class TurbinePipelineBase: @@ -155,7 +278,7 @@ class TurbinePipelineBase: device: str | dict[str] Either a string i.e. "rocm://0", or a dictionary of such with keys matching the submodels of a given pipeline. If a string, a dictionary will be created based on the pipeline's model map and the same device will be used for all submodels. - iree_target_triple: str | dict[str] + target: str | dict[str] Either a string i.e. "gfx1100", or a dictionary with keys matching the submodels of a given pipeline. ireec_flags: str | dict[str] A comma-separated string of flags to pass to the IREE compiler, or a dict of them with keys matching submodels of a given pipeline. 
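# A minimal usage sketch of the string vs. dict forms of `device`/`target`
# described in the docstring above, using the SharkSDPipeline subclass defined
# later in this patch. Annotation only; the model name and gfx target are
# placeholders, and `decomp_attn` is given per submodel because the constructor
# indexes it by submodel name.
#
#     from turbine_models.custom_models.sd_inference.sd_pipeline import SharkSDPipeline
#
#     pipe = SharkSDPipeline(
#         "stabilityai/stable-diffusion-xl-base-1.0",
#         height=1024, width=1024, batch_size=1, max_length=64,
#         precision="fp16",
#         device="rocm://0",   # a single string applies to every submodel
#         target="gfx1100",
#         decomp_attn={"unet": False, "vae": False},
#     )
#
#     # `device`/`target` may instead be dicts keyed by submodel name, e.g.
#     # device={"text_encoder": "cpu", "unet": "rocm://0", ...}; every submodel
#     # in the pipeline's model map must then appear as a key.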
@@ -164,9 +287,8 @@ class TurbinePipelineBase: def __init__( self, model_map: dict, - batch_size: int, device: str | dict[str], - iree_target_triple: str | dict[str], + target: str | dict[str], ireec_flags: str | dict[str] = None, precision: str | dict[str] = "fp16", td_spec: str | dict[str] = None, @@ -174,54 +296,73 @@ def __init__( external_weights: str | dict[str] = None, pipeline_dir: str = "./shark_vmfbs", external_weights_dir: str = "./shark_weights", + hf_model_name: str | dict[str] = None, + common_export_args: dict = {}, ): self.map = model_map - self.batch_size = batch_size if isinstance(device, dict): assert isinstance( - iree_target_triple, dict + target, dict ), "Device and target triple must be both dicts or both strings." for submodel in self.map.keys(): assert submodel in device.keys(), f"Device for {submodel} not found." assert ( - submodel in iree_target_triple.keys() + submodel in target.keys() ), f"Target arch for {submodel} not found." - self.map[submodel]["device"] = device[submodel] + self.map[submodel]["device"] = utils.iree_backend_map(device[submodel]) self.map[submodel]["driver"] = utils.iree_device_map(device[submodel]) - self.map[submodel]["target"] = iree_target_triple[submodel] + self.map[submodel]["target"] = target[submodel] else: assert isinstance( - iree_target_triple, str + target, str ), "Device and target triple must be both dicts or both strings." for submodel in self.map.keys(): - self.map[submodel]["device"] = device + self.map[submodel]["device"] = utils.iree_backend_map(device) self.map[submodel]["driver"] = utils.iree_device_map(device) - self.map[submodel]["target"] = iree_target_triple + self.map[submodel]["target"] = target + map_arguments = { "ireec_flags": ireec_flags, "precision": precision, "td_spec": td_spec, "decomp_attn": decomp_attn, "external_weights": external_weights, + "hf_model_name": hf_model_name, } for arg in map_arguments.keys(): self.map = merge_arg_into_map(self.map, map_arguments[arg], arg) - np_dtypes = { - "fp16": np.float16, - "fp32": np.float32, - } - torch_dtypes = { - "fp16": torch.float16, - "fp32": torch.float32, - } + + self.map = merge_arg_into_map( + self.map, np_dtypes[self.map[submodel]["precision"]], "np_dtype" + ) + self.map = merge_arg_into_map( + self.map, torch_dtypes[self.map[submodel]["precision"]], "torch_dtype" + ) + for arg in common_export_args.keys(): + for submodel in self.map.keys(): + self.map[submodel].get("export_args", {})[arg] = self.map[submodel].get( + arg, common_export_args[arg] + ) for submodel in self.map.keys(): - self.map = merge_arg_into_map( - self.map, np_dtypes[self.map[submodel]["precision"]], "np_dtype" - ) - self.map = merge_arg_into_map( - self.map, torch_dtypes[self.map[submodel]["precision"]], "torch_dtype" - ) - print(self.map) + for key, value in map_arguments.items(): + self.map = merge_export_arg(self.map, value, key) + for key, value in self.map[submodel].get("export_args", {}).items(): + if key == "hf_model_name": + self.map[submodel]["keywords"].append( + utils.create_safe_name(value.split("/")[-1], "") + ) + if key == "decomp_attn": + if not value: + self.map[submodel]["keywords"].append("!decomp_attn") + else: + self.map[submodel]["keywords"].append("decomp_attn") + elif key == "batch_size": + self.map[submodel]["keywords"].append(f"bs{value}") + elif key in ["height"]: + dims = f"{self.map[submodel]['export_args']['width']}x{self.map[submodel]['export_args']['height']}" + self.map[submodel]["keywords"].append(dims) + elif key in ["max_length", "precision"]: + 
self.map[submodel]["keywords"].append(str(value)) self.pipeline_dir = pipeline_dir if not os.path.exists(self.pipeline_dir): @@ -266,11 +407,17 @@ def prepare_all( for submodel in self.map.keys(): if not self.map[submodel].get("vmfb"): print("Fetching: ", submodel) - self.export_submodel(submodel, input_mlir=mlirs) - if not self.map[submodel]["external_weights"]: + self.export_submodel( + submodel, input_mlir=self.map[submodel].get("mlir") + ) + if not self.map[submodel]["export_args"]["external_weights"]: assert not self.map[submodel].get( "weights" ), f"External weights should not be used for a model with inlined params." + if not self.map[submodel].get("weights") and self.map[submodel][ + "export_args" + ].get("external_weights"): + self.export_submodel(submodel, weights_only=True) return self.prepare_all(mlirs, vmfbs, weights, interactive) def is_prepared(self, vmfbs, weights): @@ -288,20 +435,37 @@ def is_prepared(self, vmfbs, weights): continue # search self.pipeline_dir for key-specific vmfb keywords = self.map[key].get("keywords", []) + mlir_keywords = copy.deepcopy(keywords) + mlir_keywords.extend( + [ + "mlir", + ] + ) keywords.extend( [ - self.map[key]["safe_name"], "vmfb", - "bs" + str(self.batch_size), self.map[key]["target"], - self.map[key]["precision"], ] ) + neg_keywords = [] + for kw in keywords: + if kw.startswith("!"): + neg_keywords.append(kw.strip("!")) + keywords.remove(kw) + mlir_keywords.remove(kw) avail_files = os.listdir(pipeline_dir) candidates = [] + # print("MLIR KEYS: ", mlir_keywords) + # print("AVAILABLE FILES: ", avail_files) for filename in avail_files: - if all(str(x) in filename for x in keywords): + if all(str(x) in filename for x in keywords) and not any( + x in filename for x in neg_keywords + ): candidates.append(os.path.join(pipeline_dir, filename)) + if all(str(x) in filename for x in mlir_keywords) and not any( + x in filename for x in neg_keywords + ): + self.map[key]["mlir"] = os.path.join(pipeline_dir, filename) if len(candidates) == 1: self.map[key]["vmfb"] = candidates[0] elif len(candidates) > 1: @@ -313,8 +477,8 @@ def is_prepared(self, vmfbs, weights): missing[key].append("vmfb") # Make sure vmfb needs external weights, as they may be inlined. - if self.map[key].get("external_weights"): - if self.map[key]["external_weights"]: + if self.map[key].get("export_args", {}).get("external_weights"): + if not self.map[key]["external_weights"]: continue if self.map[key].get("weights"): # weights already found in model map @@ -325,10 +489,9 @@ def is_prepared(self, vmfbs, weights): continue # search self.external_weights_dir for key-specific weights w_keywords = [ - self.map[key]["safe_name"], - self.map[key]["precision"], - self.map[key]["external_weights"], + self.map[key]["export_args"]["external_weight_path"], ] + avail_files = os.listdir(self.external_weights_dir) candidates = [] for filename in avail_files: @@ -338,17 +501,20 @@ def is_prepared(self, vmfbs, weights): ) if len(candidates) == 1: self.map[key]["weights"] = candidates[0] + self.map[key]["export_args"]["external_weight_path"] = None elif len(candidates) > 1: print(f"Multiple weight files found for {key}: {candidates}") print(f"Choosing {candidates[0]} for {key}.") self.map[key][weights] = candidates[0] - else: + self.map[key]["export_args"]["external_weight_path"] = None + elif self.map[key].get("external_weights"): # weights not found in external_weights_dir. Add to list of files to generate. 
missing[key].append("weights") if not any(x for x in missing.values()): ready = True else: print("Missing files: ", missing) + ready = False return ready def get_mlir_from_turbine_tank(self, submodel, container_name): @@ -379,9 +545,12 @@ def export_submodel( if not os.path.exists(self.external_weights_dir): os.makedirs(self.external_weights_dir, exist_ok=False) - self.map[submodel]["weights"] = os.path.join( + self.map[submodel]["export_args"]["external_weight_path"] = os.path.join( self.external_weights_dir, - f"{submodel}_{self.map[submodel]['precision']}." + utils.create_safe_name( + self.map[submodel]["export_args"].get("hf_model_name", ""), "" + ) + + f"_{submodel}_{self.map[submodel]['precision']}." + self.map[submodel]["external_weights"], ) @@ -404,31 +573,43 @@ def export_submodel( input_mlir = None else: input_mlir = None - self.map[submodel]["mlir"] = input_mlir + self.map[submodel]["export_args"]["input_mlir"] = self.map[submodel].get( + "mlir", input_mlir + ) match submodel: case "unetloop": # SDXL ONLY FOR NOW pipeline_file = get_pipeline_ir( - self.width, - self.height, - self.precision, - self.batch_size, - self.max_length, + self.map[submodel]["export_args"]["width"], + self.map[submodel]["export_args"]["height"], + self.map[submodel]["export_args"]["precision"], + self.map[submodel]["export_args"]["batch_size"], + self.map[submodel]["export_args"]["max_length"], "unet_loop", ) + dims = [ + self.map[submodel]["export_args"]["width"], + self.map[submodel]["export_args"]["height"], + ] + dims = "x".join([str(x) for x in dims]) pipeline_keys = [ - utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), - "bs" + str(self.batch_size), - f"{str(self.width)}x{str(self.height)}", - self.precision, - str(self.max_length), + utils.create_safe_name( + self.map[submodel]["export_args"]["hf_model_name"].split("/")[ + -1 + ], + "", + ), + "bs" + str(self.map[submodel]["export_args"]["batch_size"]), + dims, + self.map[submodel]["export_args"]["precision"], + str(self.map[submodel]["export_args"]["max_length"]), "unetloop", ] vmfb_path = utils.compile_to_vmfb( pipeline_file, self.map["unet"]["device"], self.map["unet"]["target"], - self.ireec_flags["pipeline"], + None, os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), return_path=True, mlir_source="str", @@ -437,26 +618,36 @@ def export_submodel( self.map[submodel]["weights"] = None case "fullpipeline": # SDXL ONLY FOR NOW pipeline_file = get_pipeline_ir( - self.width, - self.height, - self.precision, - self.batch_size, - self.max_length, + self.map[submodel]["export_args"]["width"], + self.map[submodel]["export_args"]["height"], + self.map[submodel]["export_args"]["precision"], + self.map[submodel]["export_args"]["batch_size"], + self.map[submodel]["export_args"]["max_length"], "tokens_to_image", ) + dims = [ + self.map[submodel]["export_args"]["width"], + self.map[submodel]["export_args"]["height"], + ] + dims = "x".join([str(x) for x in dims]) pipeline_keys = [ - utils.create_safe_name(self.hf_model_name.split("/")[-1], ""), - "bs" + str(self.batch_size), - f"{str(self.width)}x{str(self.height)}", - self.precision, - str(self.max_length), + utils.create_safe_name( + self.map[submodel]["export_args"]["hf_model_name"].split("/")[ + -1 + ], + "", + ), + "bs" + str(self.map[submodel]["export_args"]["batch_size"]), + dims, + self.map[submodel]["export_args"]["precision"], + str(self.map[submodel]["export_args"]["max_length"]), "fullpipeline", ] vmfb_path = utils.compile_to_vmfb( pipeline_file, self.map["unet"]["device"], 
self.map["unet"]["target"], - self.ireec_flags["pipeline"], + None, os.path.join(self.pipeline_dir, "_".join(pipeline_keys)), return_path=True, mlir_source="str", @@ -465,16 +656,28 @@ def export_submodel( self.map[submodel]["weights"] = None case _: export_args = self.map[submodel].get("export_args", {}) - if self.map[submodel].get("input_mlir"): - export_args["input_mlir"] = self.map[submodel].get("mlir") + if weights_only: + export_args["weights_only"] = True + export_args["input_mlir"] = None if export_args: - vmfb_path = self.map[submodel]["export_fn"](**export_args) + exported = self.map[submodel]["export_fn"](**export_args) else: - vmfb_path = self.map[submodel]["export_fn"]() + exported = self.map[submodel]["export_fn"]() + if not self.map[submodel].get("weights") and self.map[submodel][ + "export_args" + ].get("external_weights", None): + self.map[submodel]["weights"] = self.map[submodel][ + "export_args" + ].get("external_weight_path", None) + if not weights_only: + self.map[submodel]["vmfb"] = exported # LOAD def load_map(self): for submodel in self.map.keys(): + if not self.map[submodel]["load"]: + print("Skipping load for ", submodel) + continue self.load_submodel(submodel) def load_submodel(self, submodel): @@ -484,7 +687,8 @@ def load_submodel(self, submodel): "external_weights" ): raise ValueError(f"Weights not found for {submodel}.") - self.map[submodel]["runner"] = PipelineComponent() + dest_type = self.map[submodel].get("dest_type", "devicearray") + self.map[submodel]["runner"] = PipelineComponent(dest_type=dest_type) self.map[submodel]["runner"].load( self.map[submodel]["driver"], self.map[submodel]["vmfb"], diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index 8b3176c8d..d87ff5993 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -17,6 +17,7 @@ from shark_turbine.dynamo.passes import ( DEFAULT_DECOMPOSITIONS, ) +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from turbine_models.custom_models.sd_inference import utils import torch import torch._dynamo as dynamo @@ -160,6 +161,7 @@ def export_mmdit_model( weights_only=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 + np_dtype = "float16" if precision == "fp16" else "float32" safe_name = utils.create_safe_name( hf_model_name, f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_mmdit", @@ -239,8 +241,22 @@ class CompiledMmdit(CompiledModule): inst = CompiledMmdit(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) - + module = CompiledModule.get_mlir_module(inst) + + model_metadata_run_forward = { + "model_name": "sd3_mmdit", + "input_shapes": [ + hidden_states_shape, + encoder_hidden_states_shape, + pooled_projections_shape, + init_batch_dim, + ], + "input_dtypes": [np_dtype for x in range(4)], + "output_shapes": [hidden_states_shape], + "output_dtypes": [np_dtype], + } + module = AddMetadataPass(module, model_metadata_run_forward, "run_forward").run() + module_str = str(module) if compile_to != "vmfb": return module_str else: diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py index ea0213486..2c1d04cf1 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py +++ 
b/models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py @@ -12,6 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Union from shark_turbine.aot import * import shark_turbine.ops.iree as ops +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from iree.compiler.ir import Context import iree.runtime as ireert import numpy as np @@ -213,6 +214,7 @@ def export_scheduler_model( upload_ir=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 + np_dtype = "float16" if precision == "fp16" else "float32" scheduler_module = FlowSchedulingModel(hf_model_name, num_inference_steps, dtype) vmfb_names = [ "EulerFlowScheduler", @@ -317,8 +319,34 @@ class CompiledScheduler(CompiledModule): import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledScheduler(context=Context(), import_to=import_to) - module_str = str(CompiledModule.get_mlir_module(inst)) + module = CompiledModule.get_mlir_module(inst) + model_metadata_run_init = { + "model_name": "sd3_scheduler_FlowEulerDiscrete", + "input_shapes": [sample], + "input_dtypes": [np_dtype], + "output_shapes": [sample, "?", "?"], + "output_dtypes": [np_dtype, "int32", "float32"], + } + model_metadata_run_prep = { + "model_name": "sd3_scheduler_FlowEulerDiscrete", + "input_shapes": [sample, 1, [19]], + "input_dtypes": [np_dtype, "float32", "float32"], + "output_shapes": [noise_pred_shape, noise_pred_shape[0]], + "output_dtypes": [np_dtype, "float32"], + } + model_metadata_run_step = { + "model_name": "sd3_scheduler_FlowEulerDiscrete", + "input_shapes": [noise_pred_shape, 1, sample, 1, 1], + "input_dtypes": [np_dtype, np_dtype, np_dtype, np_dtype, "int64"], + "output_shapes": [sample], + "output_dtypes": [np_dtype], + } + module = AddMetadataPass(module, model_metadata_run_init, "run_init").run() + module = AddMetadataPass(module, model_metadata_run_prep, "run_prep").run() + module = AddMetadataPass(module, model_metadata_run_step, "run_step").run() + + module_str = str(module) if compile_to != "vmfb": return module_str elif compile_to == "vmfb": diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index 2e0a69445..33107aa9f 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -13,6 +13,7 @@ from iree.compiler.ir import Context import numpy as np from shark_turbine.aot import * +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from turbine_models.custom_models.sd_inference import utils import torch from turbine_models.custom_models.sd3_inference.text_encoder_impls import ( @@ -113,8 +114,8 @@ def forward(self, tokens_g, tokens_l, tokens_t5xxl, neg_g, neg_l, neg_t5): @torch.no_grad() def export_text_encoders( hf_model_name, - hf_auth_token=None, max_length=64, + batch_size=1, precision="fp16", compile_to="torch", external_weights=None, @@ -126,7 +127,6 @@ def export_text_encoders( pipeline_dir=None, input_mlir=None, attn_spec=None, - output_batchsize=1, decomp_attn=True, ): @@ -192,8 +192,20 @@ class CompiledTextEncoder(CompiledModule): inst = CompiledTextEncoder(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) - + module = CompiledModule.get_mlir_module(inst) + + model_metadata_forward = { + "model_name": "sd3_clip_t5xxl_text_encoders", + "input_shapes": [(1, max_length, 2) for x in range(6)], + 
"input_dtypes": ["int64" for x in range(6)], + "output_shapes": [ + (2 * output_batchsize, max_length * 2, 4096), + (2 * output_batchsize, 2048), + ], + "output_dtypes": ["float32"], + } + module = AddMetadataPass(module, model_metadata_forward, "forward").run() + module_str = str(module) if compile_to != "vmfb": return module_str else: @@ -215,8 +227,8 @@ class CompiledTextEncoder(CompiledModule): mod_str, _ = export_text_encoders( args.hf_model_name, - args.hf_auth_token, args.max_length, + args.batch_size, args.precision, args.compile_to, args.external_weights, @@ -228,7 +240,6 @@ class CompiledTextEncoder(CompiledModule): pipeline_dir=args.pipeline_dir, input_mlir=args.input_mlir, attn_spec=args.attn_spec, - output_batchsize=args.batch_size, ) if args.input_mlir or args.weights_only or args.compile_to == "vmfb": exit() diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py index 1267bb862..521f90bb9 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py @@ -21,9 +21,9 @@ def run_vae( def run_torch_vae(hf_model_name, variant, example_input): - from turbine_models.custom_models.sd3_inference.sd3_vae import VaeModel + from turbine_models.custom_models.sd_inference.vae import SD3VaeModel - vae_model = VaeModel( + vae_model = SD3VaeModel( hf_model_name, ) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 52c36a5c3..11705a916 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -9,78 +9,50 @@ from iree.compiler.ir import Context from shark_turbine.aot import * +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from turbine_models.custom_models.sd_inference import utils import torch from transformers import CLIPTextModel, CLIPTokenizer, CLIPProcessor from turbine_models.turbine_tank import turbine_tank -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument( - "--hf_auth_token", type=str, help="The Hugging Face auth token, required" -) -parser.add_argument( - "--hf_model_name", - type=str, - help="HF model name", - default="CompVis/stable-diffusion-v1-4", -) -parser.add_argument("--compile_to", type=str, help="torch, linalg, vmfb") -parser.add_argument("--external_weight_path", type=str, default="") -parser.add_argument( - "--external_weights", - type=str, - default=None, - help="saves ir/vmfb without global weights for size and readability, options [safetensors]", -) -parser.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm") -# TODO: Bring in detection for target triple -parser.add_argument( - "--iree_target_triple", - type=str, - default="", - help="Specify vulkan target triple or rocm/cuda target device.", -) -parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") - +@torch.no_grad() def export_clip_model( hf_model_name, - hf_auth_token: str = None, + batch_size: int = 1, max_length: int = 64, precision: str = "fp16", compile_to: str = "torch", external_weights: str = None, external_weight_path: str = None, device: str = "llvm-cpu", - target_triple: str = "x86_64-linux-gnu", + target: str = "x86_64-linux-gnu", ireec_flags: str = None, exit_on_vmfb: bool = False, pipeline_dir: str = None, input_mlir: str = None, - td_spec: str = None, + 
attn_spec: str = None, weights_only: bool = False, upload_ir: bool = False, + decomp_attn: bool = False, ): input_len = max_length + safe_name = utils.create_safe_name( + hf_model_name, f"_bs{batch_size}_{str(max_length)}-{precision}-clip" + ) if pipeline_dir not in [None, ""]: - safe_name = os.path.join(pipeline_dir, "clip") - else: - safe_name = utils.create_safe_name( - hf_model_name, f"_{str(max_length)}-{precision}-clip-{device}" - ) + safe_name = os.path.join(pipeline_dir, safe_name) if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, - target_triple, + target, ireec_flags, safe_name, mlir_source="file", return_path=not exit_on_vmfb, const_expr_hoisting=True, - attn_spec=td_spec, + attn_spec=attn_spec, ) return vmfb_path if "google/t5" in hf_model_name: @@ -101,27 +73,26 @@ def export_clip_model( tokenizer = CLIPTokenizer.from_pretrained( hf_model_name, subfolder="tokenizer", - token=hf_auth_token, ) hf_subfolder = "text_encoder" text_encoder_model = CLIPTextModel.from_pretrained( hf_model_name, subfolder=hf_subfolder, - token=hf_auth_token, ) - + if precision == "fp16": + text_encoder_model = text_encoder_model.half() mapper = {} utils.save_external_weights( mapper, text_encoder_model, external_weights, external_weight_path ) - if weights_only: return external_weight_path if "google/t5" in hf_model_name: + input_shapes = [(batch_size, input_len), (batch_size, input_len)] - class CompiledClip(CompiledModule): + class CompiledTextEncoder(CompiledModule): if external_weights: params = export_parameters( text_encoder_model, @@ -132,7 +103,7 @@ class CompiledClip(CompiledModule): else: params = export_parameters(text_encoder_model) - def main( + def encode_tokens( self, inp=AbstractTensor(1, input_len, dtype=torch.int64), decoder_input_ids=AbstractTensor(1, input_len, dtype=torch.int64), @@ -142,8 +113,9 @@ def main( ) else: + input_shapes = [str((batch_size, input_len)), str((batch_size, input_len))] - class CompiledClip(CompiledModule): + class CompiledTextEncoder(CompiledModule): if external_weights: params = export_parameters( text_encoder_model, @@ -154,31 +126,61 @@ class CompiledClip(CompiledModule): else: params = export_parameters(text_encoder_model) - def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): + def encode_tokens_attn_mask( + self, + inp=AbstractTensor(1, input_len, dtype=torch.int64), + attn_mask=AbstractTensor(1, input_len, dtype=torch.int64), + ): + return jittable(text_encoder_model.forward)( + input_ids=inp, attention_mask=attn_mask + ) + + def encode_tokens( + self, + inp=AbstractTensor(1, input_len, dtype=torch.int64), + ): return jittable(text_encoder_model.forward)(input_ids=inp) import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledClip(context=Context(), import_to=import_to) - - module_str = str(CompiledModule.get_mlir_module(inst)) + inst = CompiledTextEncoder(context=Context(), import_to=import_to) + module = CompiledModule.get_mlir_module(inst) + + model_metadata_attn_mask = { + "model_name": hf_model_name + "_text_encoder", + "input_shapes": input_shapes, + "input_dtypes": ["int64", "int64"], + "use_attention_mask": True, + } + model_metadata_encode = { + "model_name": hf_model_name + "_text_encoder", + "input_shapes": input_shapes[0], + "input_dtypes": ["int64"], + "use_attention_mask": False, + } + module = AddMetadataPass( + module, model_metadata_attn_mask, "encode_tokens_attn_mask" + ).run() + module = AddMetadataPass(module, model_metadata_encode, "encode_tokens").run() + + module_str = 
str(module) if compile_to != "vmfb": - return module_str, tokenizer + return module_str else: vmfb_path = utils.compile_to_vmfb( module_str, device, - target_triple, + target, ireec_flags, safe_name, return_path=not exit_on_vmfb, const_expr_hoisting=True, - attn_spec=td_spec, + attn_spec=attn_spec, ) - return vmfb_path, None + return vmfb_path if __name__ == "__main__": - from .sd_cmd_opts import args + from turbine_models.custom_models.sd_inference.sd_cmd_opts import args mod_str, _ = export_clip_model( args.hf_model_name, @@ -193,7 +195,7 @@ def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)): exit_on_vmfb=True, pipeline_dir=args.pipeline_dir, input_mlir=args.input_mlir, - td_spec=args.attn_spec, + attn_spec=args.attn_spec, weights_only=False, upload_ir=False, ) diff --git a/models/turbine_models/custom_models/sd_inference/clip_runner.py b/models/turbine_models/custom_models/sd_inference/clip_runner.py index fe5310ff6..da0908fad 100644 --- a/models/turbine_models/custom_models/sd_inference/clip_runner.py +++ b/models/turbine_models/custom_models/sd_inference/clip_runner.py @@ -55,7 +55,7 @@ def run_clip( if "google/t5" in hf_model_name: inp += [ireert.asdevicearray(runner.config.device, example_input)] - results = runner.ctx.modules.compiled_clip["main"](*inp) + results = runner.ctx.modules.compiled_text_encoder["encode_tokens"](*inp) return results diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 2c8d618c6..1a8cd8858 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -74,6 +74,9 @@ def __init__( self.model = scheduler self.height = height self.width = width + self.is_sd3 = False + if "stable-diffusion-3" in hf_model_name: + self.is_sd3 = True self.batch_size = batch_size self.do_classifier_free_guidance = True self.model.set_timesteps(num_inference_steps) @@ -129,19 +132,32 @@ def step(self, noise_pred, t, sample, guidance_scale, i): class SharkSchedulerCPUWrapper: @torch.no_grad() def __init__( - self, scheduler, batch_size, num_inference_steps, dest_device, latents_dtype + self, + scheduler, + batch_size, + dest_device, + latents_dtype, + conditional_timesteps=False, ): - self.do_classifier_free_guidance = True self.module = scheduler self.dest = dest_device - self.dtype = latents_dtype self.batch_size = batch_size self.timesteps = None + self.do_guidance = True + self.repeat_sample = True + + # Enable this on init for models that use a pair of timestep values per unet step. + # this includes sd3 and some others we don't support yet. + # It allows passage of 'uncond_t' to the scale_model_input function and repeats the + # default timestep value if no 'uncond_t' is passed. 
+ self.conditional_timesteps = conditional_timesteps + + self.dtype = latents_dtype self.torch_dtype = ( torch.float32 if latents_dtype == "float32" else torch.float16 ) - def initialize(self, sample, num_inference_steps): + def initialize_sdxl(self, sample, num_inference_steps): if isinstance(sample, ireert.DeviceArray): sample = torch.tensor(sample.to_host(), dtype=torch.float32) @@ -154,7 +170,7 @@ def initialize(self, sample, num_inference_steps): crops_coords_top_left = (0, 0) add_time_ids = list(original_size + crops_coords_top_left + target_size) add_time_ids = torch.tensor([add_time_ids], dtype=self.torch_dtype) - if self.do_classifier_free_guidance: + if self.do_guidance: add_time_ids = torch.cat([add_time_ids] * 2, dim=0) add_time_ids = add_time_ids.repeat(self.batch_size, 1).type( self.torch_dtype @@ -162,25 +178,39 @@ def initialize(self, sample, num_inference_steps): step_indexes = torch.tensor(len(self.timesteps)) timesteps = self.timesteps sample = sample * self.module.init_noise_sigma - add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype) return sample, add_time_ids, step_indexes, timesteps - def scale_model_input(self, sample, t, timesteps): - if self.do_classifier_free_guidance: + def initialize_sd(self, sample, num_inference_steps): + if isinstance(sample, ireert.DeviceArray): + sample = torch.tensor(sample.to_host(), dtype=torch.float32) + self.module.set_timesteps(num_inference_steps) + timesteps = self.module.timesteps + sample = sample * self.module.init_noise_sigma + return sample, timesteps + + def scale_model_input(self, sample, t, t_uncond=None): + if self.repeat_sample: sample = torch.cat([sample] * 2) - t = timesteps[t] + if self.conditional_timesteps: + if t_uncond: + t = torch.tensor([t, t_uncond]) + else: + t = torch.tensor([t, t]) + else: + t = torch.tensor([t]) scaled = self.module.scale_model_input(sample, t) - t = ireert.asdevicearray(self.dest, [t], self.dtype) - scaled = ireert.asdevicearray(self.dest, scaled, self.dtype) return scaled, t - def step(self, noise_pred, t, latents, guidance_scale, i): + def step(self, noise_pred, t, latents, guidance_scale=None): if isinstance(t, ireert.DeviceArray): t = torch.tensor(t.to_host()) + if isinstance(noise_pred, ireert.DeviceArray): + noise_pred = torch.tensor(noise_pred.to_host()) + elif isinstance(noise_pred, np.ndarray): + noise_pred = torch.tensor(noise_pred) if isinstance(guidance_scale, ireert.DeviceArray): guidance_scale = torch.tensor(guidance_scale.to_host()) - noise_pred = torch.tensor(noise_pred.to_host()) - if self.do_classifier_free_guidance: + if self.do_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * ( noise_pred_text - noise_pred_uncond @@ -189,8 +219,7 @@ def step(self, noise_pred, t, latents, guidance_scale, i): noise_pred, t, latents, - return_dict=False, - )[0] + ).prev_sample @torch.no_grad() @@ -204,11 +233,14 @@ def export_scheduler_model( precision: str = "fp16", compile_to: str = "torch", device: str = None, - target_triple: str = None, + target: str = None, ireec_flags: str = None, exit_on_vmfb: bool = False, pipeline_dir: str = None, input_mlir: str = None, + attn_spec: str = None, + external_weights: str = None, + external_weight_path: str = None, upload_ir=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 @@ -233,9 +265,9 @@ def export_scheduler_model( vmfb_path = utils.compile_to_vmfb( input_mlir, device, - target_triple, + target, ireec_flags, - safe_name + "_" + 
target_triple, + safe_name, mlir_source="file", return_path=not exit_on_vmfb, ) @@ -329,9 +361,9 @@ class CompiledScheduler(CompiledModule): vmfb = utils.compile_to_vmfb( module_str, device, - target_triple, + target, ireec_flags, - safe_name + "_" + target_triple, + safe_name, return_path=True, ) if exit_on_vmfb: @@ -350,6 +382,8 @@ def get_scheduler(model_id, scheduler_id): scheduler = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", algorithm_type="dpmsolver++" ) + else: + raise ValueError(f"Scheduler {scheduler_id} not found.") if "Karras" in scheduler_id: scheduler.config.use_karras_sigmas = True diff --git a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py index e56737369..8c68ad06c 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py +++ b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py @@ -47,7 +47,7 @@ def is_valid_file(arg): "--scheduler_id", type=str, help="Scheduler ID", - default="Euler", + default="EulerDiscrete", ) ############################################################################## @@ -101,7 +101,7 @@ def is_valid_file(arg): p.add_argument( "--external_weights_dir", type=str, - default="", + default="./weights", help="Directory containing external weights for a job that requires more than one weights file. When importing, this is used to specify where to save the model weights, and at runtime, this is used to specify where to load the model weights from. Files will then be saved according to the parameters that make them unique, i.e. ___.", ) @@ -126,7 +126,7 @@ def is_valid_file(arg): p.add_argument( "--pipeline_dir", type=str, - default=None, + default="./vmfbs", help="Directory to save pipeline artifacts", ) @@ -137,6 +137,13 @@ def is_valid_file(arg): help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.", ) +p.add_argument( + "--cpu_scheduling", + default=True, + action="store_true", + help="Run scheduling on native pytorch CPU backend.", +) + ############################################################################## # SDXL Modelling Options # These options are used to control model defining parameters for SDXL. @@ -146,10 +153,10 @@ def is_valid_file(arg): p.add_argument("--batch_size", type=int, default=1, help="Batch size for inference") p.add_argument( - "--height", type=int, default=1024, help="Height of Stable Diffusion output image." + "--height", type=int, default=512, help="Height of Stable Diffusion output image." ) p.add_argument( - "--width", type=int, default=1024, help="Width of Stable Diffusion output image" + "--width", type=int, default=512, help="Width of Stable Diffusion output image" ) p.add_argument( "--precision", @@ -169,11 +176,22 @@ def is_valid_file(arg): p.add_argument( "--vae_decomp_attn", - type=bool, - default=False, + action="store_true", help="Decompose attention for VAE decode only at fx graph level", ) +p.add_argument( + "--unet_decomp_attn", + action="store_true", + help="Decompose attention for unet only at fx graph level", +) + +p.add_argument( + "--use_i8_punet", + action="store_true", + help="Use i8 quantized Partitioned UNet for inference", +) + ############################################################################## # SDXL script general options. 
############################################################################## @@ -244,7 +262,7 @@ def is_valid_file(arg): p.add_argument( "--iree_target_triple", type=str, - default="", + default="x86_64-linux-gnu", help="Specify vulkan target triple or rocm/cuda target device.", ) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 3975bfbbb..868872479 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -17,7 +17,19 @@ schedulers, utils, ) -from .tokenization import get_weighted_text_embeddings +from turbine_models.custom_models.sdxl_inference import ( + sdxl_prompt_encoder as sdxl_clip, + unet as sdxl_unet, +) +from turbine_models.custom_models.sd3_inference import ( + sd3_text_encoders, + sd3_mmdit, +) +from turbine_models.custom_models.pipeline_base import ( + TurbinePipelineBase, + merge_arg_into_map, +) +from turbine_models.custom_models.sd_inference.tokenization import encode_prompt from turbine_models.model_runner import vmfbRunner from transformers import CLIPTokenizer from pathlib import Path @@ -28,419 +40,411 @@ import time from datetime import datetime as dt +# These are arguments common among submodel exports. +# They are expected to be populated in two steps: +# First, by the child class, +# and second by the base class for inference task-agnostic args. + +sd1_sd2_model_map = { + "text_encoder": { + "module_name": "compiled_text_encoder", + "keywords": ["clip"], + "dest_type": "torch", + "export_fn": clip.export_clip_model, + "export_args": { + "batch_size": 1, + "max_length": 64, + }, + }, + "unet": { + "module_name": "compiled_unet", + "keywords": ["unet"], + "export_fn": unet.export_unet_model, + "export_args": { + "batch_size": 1, + "height": 512, + "width": 512, + "max_length": 64, + "decomp_attn": None, + }, + }, + "vae": { + "module_name": "compiled_vae", + "keywords": ["vae"], + "dest_type": "numpy", + "export_fn": vae.export_vae_model, + "export_args": { + "batch_size": 1, + "height": 512, + "width": 512, + "num_channels": 4, + "decomp_attn": None, + }, + }, +} +sdxl_model_map = { + "text_encoder": { + "module_name": "compiled_clip", + "keywords": ["prompt_encoder"], + "dest_type": "torch", + "export_fn": sdxl_clip.export_prompt_encoder, + "export_args": { + "batch_size": 1, + "max_length": 64, + }, + }, + "unet": { + "module_name": "compiled_unet", + "keywords": ["unet", "!loop"], + "export_fn": sdxl_unet.export_unet_model, + "export_args": { + "batch_size": 1, + "height": 1024, + "width": 1024, + "max_length": 64, + "decomp_attn": None, + }, + }, + "vae": { + "module_name": "compiled_vae", + "keywords": ["vae"], + "dest_type": "numpy", + "export_fn": vae.export_vae_model, + "export_args": { + "batch_size": 1, + "height": 1024, + "width": 1024, + "num_channels": 4, + "decomp_attn": None, + }, + }, + "unetloop": { + "module_name": "sdxl_compiled_pipeline", + "load": False, + "keywords": ["unetloop"], + "wraps": ["unet", "scheduler"], + "export_args": { + "batch_size": 1, + "height": 1024, + "width": 1024, + "max_length": 64, + }, + }, + "fullpipeline": { + "module_name": "sdxl_compiled_pipeline", + "load": False, + "keywords": ["fullpipeline"], + "wraps": ["text_encoder", "unet", "scheduler", "vae"], + "export_args": { + "batch_size": 1, + "height": 1024, + "width": 1024, + "max_length": 64, + }, + }, +} +sd3_model_map = { + "text_encoder": { + "module_name": 
"compiled_text_encoder", + "keywords": ["text_encoder"], + "export_fn": sd3_text_encoders.export_text_encoders, + "export_args": { + "batch_size": 1, + "max_length": 64, + }, + }, + "mmdit": { + "module_name": "compiled_mmdit", + "keywords": ["mmdit"], + "export_fn": sd3_mmdit.export_mmdit_model, + "export_args": { + "batch_size": 1, + "height": 1024, + "width": 1024, + "max_length": 64, + "decomp_attn": None, + }, + }, + "vae": { + "module_name": "compiled_vae", + "keywords": ["vae"], + "dest_type": "numpy", + "export_fn": vae.export_vae_model, + "export_args": { + "batch_size": 1, + "height": 1024, + "width": 1024, + "num_channels": 16, + "decomp_attn": None, + }, + }, +} + + +def get_sd_model_map(hf_model_name): + if isinstance(hf_model_name, dict): + name = hf_model_name["text_encoder"] + else: + name = hf_model_name + if name in ["stabilityai/sdxl-turbo", "stabilityai/stable-diffusion-xl-base-1.0"]: + return sdxl_model_map + elif "stabilityai/stable-diffusion-3" in name: + return sd3_model_map + else: + return sd1_sd2_model_map -device_list = [ - "cpu", - "vulkan", - "cuda", - "rocm", -] - -rt_device_list = [ - "local-task", - "local-sync", - "vulkan", - "cuda", - "rocm", - "hip", -] - -SUBMODELS = { - "clip": None, - "scheduler": None, - "unet": None, - "vae_decode": None, + +torch_dtypes = { + "fp32": torch.float32, + "fp16": torch.float16, + "float32": torch.float32, + "float16": torch.float16, + "int8": torch.int8, + "i8": torch.int8, } -class SharkSDPipeline: +class SharkSDPipeline(TurbinePipelineBase): def __init__( self, - hf_model_name: str, - scheduler_id: str, + hf_model_name: str | dict[str], height: int, width: int, - precision: str, - max_length: int, batch_size: int, - num_inference_steps: int, - device: str, - iree_target_triple: str, - ireec_flags: dict = copy.deepcopy(SUBMODELS), - attn_spec: str = None, - decomp_attn: bool = False, - pipeline_dir: str | Path = "./shark_vmfbs", - external_weights_dir: str | Path = "./shark_weights", - external_weights: str = "safetensors", - custom_vae: str = None, - vae_decomp_attn: bool = True, + max_length: int | dict[int], + precision: str | dict[str], + device: str | dict[str], + target: str | dict[str], + ireec_flags: str | dict[str] = None, + attn_spec: str | dict[str] = None, + decomp_attn: bool | dict[bool] = False, + pipeline_dir: str = "./shark_vmfbs", + external_weights_dir: str = "./shark_weights", + external_weights: str | dict[str] = "safetensors", + num_inference_steps: int = 30, + cpu_scheduling: bool = True, + scheduler_id: str = None, # compatibility only + shift: float = 1.0, # compatibility only + use_i8_punet: bool = False, ): - self.hf_model_name = hf_model_name - self.iree_dtype = "float32" if precision == "fp32" else "float16" - self.torch_dtype = torch.float32 if precision == "fp32" else torch.float16 - self.cpu_scheduling = True - self.scheduler_id = scheduler_id + common_export_args = { + "hf_model_name": None, + "precision": None, + "compile_to": "vmfb", + "device": None, + "target": None, + "exit_on_vmfb": False, + "pipeline_dir": pipeline_dir, + "input_mlir": None, + "attn_spec": None, + "external_weights": None, + "external_weight_path": None, + } + sd_model_map = get_sd_model_map(hf_model_name) + for submodel in sd_model_map: + if "load" not in sd_model_map[submodel]: + sd_model_map[submodel]["load"] = True + sd_model_map[submodel]["export_args"]["batch_size"] = batch_size + if "max_length" in sd_model_map[submodel]["export_args"]: + max_length_sub = ( + max_length if isinstance(max_length, int) else 
max_length[submodel] + ) + sd_model_map[submodel]["export_args"]["max_length"] = max_length_sub + if "height" in sd_model_map[submodel]["export_args"]: + sd_model_map[submodel]["export_args"]["height"] = height + sd_model_map[submodel]["export_args"]["width"] = width + if "decomp_attn" in sd_model_map[submodel]["export_args"]: + sd_model_map[submodel]["export_args"]["decomp_attn"] = decomp_attn[ + submodel + ] + super().__init__( + sd_model_map, + device, + target, + ireec_flags, + precision, + attn_spec, + decomp_attn, + external_weights, + pipeline_dir, + external_weights_dir, + hf_model_name, + common_export_args, + ) + for submodel in sd_model_map: + if self.map[submodel].get("external_weights"): + weights_filename = utils.create_safe_name( + self.map[submodel]["export_args"]["hf_model_name"], + f"_{submodel}_{self.map[submodel]['precision']}", + ) + weights_filename += ( + "." + self.map[submodel]["export_args"]["external_weights"] + ) + self.map[submodel]["export_args"][ + "external_weight_path" + ] = weights_filename + + self.batch_size = batch_size + self.model_max_length = max_length self.height = height self.width = width - self.precision = precision - self.max_length = max_length - self.model_max_length = max_length - self.batch_size = batch_size + self.latents_dtype = torch_dtypes[self.map["unet"]["precision"]] + self.cpu_scheduling = cpu_scheduling + self.scheduler_id = scheduler_id self.num_inference_steps = num_inference_steps - self.device = device - self.iree_target_triple = iree_target_triple - self.ireec_flags = ireec_flags if ireec_flags else copy.deepcopy(SUBMODELS) - self.attn_spec = attn_spec - self.decomp_attn = decomp_attn - self.pipeline_dir = pipeline_dir - self.external_weights_dir = external_weights_dir - self.external_weights = external_weights - self.custom_vae = custom_vae - self.vae_decomp_attn = vae_decomp_attn - self.is_sdxl = "xl" in self.hf_model_name - - # FILE MANAGEMENT AND PIPELINE SETUP - - def check_prepared( - self, - mlirs: dict, - vmfbs: dict, - weights: dict, - interactive: bool = True, - ): - ready, vmfbs, weights = self.is_prepared(vmfbs, weights) - if not ready: - if interactive: - do_continue = input( - f"\nIt seems you are missing some necessary files. Would you like to generate them now? (y/n)" - ) - if do_continue.lower() != "y": - exit() - else: - do_continue = "y" - if do_continue.lower() == "y": - for submodel in vmfbs.keys(): - if vmfbs[submodel] == None: - vmfb, weight = self.export_submodel(submodel, input_mlir=mlirs) - vmfbs[submodel] = vmfb - if weights[submodel] is None: - weights[submodel] = weight - elif weights[submodel] is None and "scheduler" not in submodel: - _, weight = self.export_submodel(submodel, weights_only=True) - weights[submodel] = weight - ready, vmfbs, weights = self.is_prepared(vmfbs, weights) - if ready: - print("All necessary files found.") - return vmfbs, weights - else: - print("There was an error generating the necessary files.") - exit() - else: - print("All necessary files found. 
Loading pipeline.") - return vmfbs, weights - - def is_prepared(self, vmfbs, weights): - missing = [] - for key in vmfbs: - if "scheduler" in key and self.cpu_scheduling: - continue - default_filepath = os.path.join(self.pipeline_dir, key + ".vmfb") - if vmfbs[key] is not None and os.path.exists(vmfbs[key]): - continue - elif vmfbs[key] == None and os.path.exists(default_filepath): - vmfbs[key] = default_filepath - else: - missing.append(key + ".vmfb") - for w_key in weights: - if "scheduler" in w_key: - continue - if weights[w_key] is not None and os.path.exists(weights[w_key]): - continue - if self.external_weights is None: - weights[w_key] = None - continue - default_name = os.path.join( - self.external_weights_dir, w_key + "." + self.external_weights - ) - if weights[w_key] is None and os.path.exists(default_name): - weights[w_key] = os.path.join(default_name) - else: - missing.append(w_key + "." + self.external_weights) - if len(missing) > 0: - print(f"Missing files: " + ", ".join(missing)) - return False, vmfbs, weights - else: - return True, vmfbs, weights - def get_mlir_from_turbine_tank(self, submodel, container_name): - from turbine_models.turbine_tank import downloadModelArtifacts + self.text_encoder = None + self.unet = None + self.mmdit = None + self.vae = None + self.scheduler = None - safe_name = utils.create_safe_name( - self.hf_model_name, - f"_{self.max_length}_{self.height}x{self.width}_{self.precision}_{submodel}.mlir", - ) - mlir_path = downloadModelArtifacts( - safe_name, - container_name, - ) - return mlir_path - - # IMPORT / COMPILE PHASE - - def get_torch_models(self, submodel): - match submodel: - case "unet": - unet_torch = unet.UnetModel( - self.hf_model_name, - ) - return unet_torch - case "vae_decode": - vae_torch = vae.VaeModel( - self.hf_model_name, - self.custom_vae, - ) - return vae_torch + self.split_scheduler = True - def export_submodel( - self, - submodel: str, - input_mlir: str = None, - weights_only: bool = False, - ): - if not os.path.exists(self.pipeline_dir): - os.makedirs(self.pipeline_dir) - if self.external_weights_dir: - if not os.path.exists(self.external_weights_dir): - os.makedirs(external_weights_dir, exist_ok=True) - vae_external_weight_path = os.path.join( - self.external_weights_dir, "vae_decode." + self.external_weights - ) - unet_external_weight_path = os.path.join( - self.external_weights_dir, "unet." + self.external_weights - ) - clip_external_weight_path = os.path.join( - self.external_weights_dir, "clip." + self.external_weights + self.base_model_name = ( + hf_model_name + if isinstance(hf_model_name, str) + else hf_model_name.get("unet", hf_model_name.get("mmdit")) + ) + self.is_img2img = False + self.is_sdxl = "xl" in self.base_model_name + self.is_sd3 = "stable-diffusion-3" in self.base_model_name + if self.is_sdxl: + if self.split_scheduler: + self.map.pop("unetloop") + self.map.pop("fullpipeline") + self.tokenizers = [ + CLIPTokenizer.from_pretrained( + self.base_model_name, subfolder="tokenizer" + ), + CLIPTokenizer.from_pretrained( + self.base_model_name, subfolder="tokenizer_2" + ), + ] + elif not self.is_sd3: + self.tokenizer = CLIPTokenizer.from_pretrained( + self.base_model_name, subfolder="tokenizer" ) - elif self.external_weights is None: - print( - "No external weights type specified using --external_weights, weights for imported .mlir files will not be externalized." 
+ + self.use_i8_punet = self.use_punet = use_i8_punet + if self.use_i8_punet: + self.map["unet"]["export_args"]["precision"] = "i8" + self.map["unet"]["export_args"]["use_punet"] = True + self.map["unet"]["keywords"].append("punet") + self.map["unet"]["module_name"] = "compiled_punet" + self.map["unet"]["function_name"] = "main" + self.map["unet"]["export_args"]["external_weight_path"] = ( + utils.create_safe_name(self.base_model_name) + "_punet_dataset_i8.irpa" ) - vae_external_weight_path = None - unet_external_weight_path = None - clip_external_weight_path = None + for idx, word in enumerate(self.map["unet"]["keywords"]): + if word in ["fp32", "fp16"]: + self.map["unet"]["keywords"][idx] = "i8" + break else: - print( - f"No external weights directory specified using --external_weights_dir, we assume you have your own weights in {self.pipeline_dir}." - ) - external_weights_dir = self.pipeline_dir - if not os.path.exists(self.pipeline_dir): - os.makedirs(self.pipeline_dir, exist_ok=True) - vae_external_weight_path = os.path.join( - self.pipeline_dir, "vae_decode." + self.external_weights - ) - unet_external_weight_path = os.path.join( - self.pipeline_dir, "unet." + self.external_weights - ) - clip_external_weight_path = os.path.join( - self.pipeline_dir, "clip." + self.external_weights - ) - if weights_only: - input_mlir = copy.deepcopy(SUBMODELS) - match submodel: - case "clip": - _, clip_vmfb = clip.export_clip_model( - self.hf_model_name, - None, - self.max_length, - self.precision, - "vmfb", - self.external_weights, - clip_external_weight_path, - self.device, - self.iree_target_triple, - self.ireec_flags["clip"], - exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, - input_mlir=input_mlir["clip"], - td_spec=self.attn_spec, - weights_only=weights_only, - ) - return clip_vmfb, clip_external_weight_path - case "scheduler": - if self.cpu_scheduling: - return (None, None) - scheduler = schedulers.export_scheduler_model( - self.hf_model_name, - self.scheduler_id, - self.batch_size, - self.height, - self.width, - self.num_inference_steps, - self.precision, - "vmfb", - self.device, - self.iree_target_triple, - self.ireec_flags["scheduler"], - exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, - input_mlir=input_mlir["scheduler"], - ) - return scheduler, None - case "unet": - if input_mlir[submodel]: - unet_torch = None - else: - unet_torch = self.get_torch_models("unet") - - unet_vmfb = unet.export_unet_model( - unet_torch, - self.hf_model_name, - self.batch_size, - self.height, - self.width, - self.precision, - self.max_length, - None, - "vmfb", - self.external_weights, - unet_external_weight_path, - self.device, - self.iree_target_triple, - self.ireec_flags["unet"], - self.decomp_attn, - exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, - attn_spec=self.attn_spec, - input_mlir=input_mlir["unet"], - weights_only=weights_only, - ) - return unet_vmfb, unet_external_weight_path - case "vae_decode": - if not input_mlir[submodel]: - vae_torch = self.get_torch_models("vae_decode") - else: - vae_torch = None - vae_decode_vmfb = vae.export_vae_model( - vae_torch, - self.hf_model_name, - self.batch_size, - self.height, - self.width, - self.precision, - "vmfb", - self.external_weights, - vae_external_weight_path, - self.device, - self.iree_target_triple, - self.ireec_flags["vae"], - "decode", - self.vae_decomp_attn, - exit_on_vmfb=False, - pipeline_dir=self.pipeline_dir, - attn_spec=self.attn_spec, - input_mlir=input_mlir["vae_decode"], - weights_only=weights_only, - ) - return 
vae_decode_vmfb, vae_external_weight_path + self.map["unet"]["keywords"].append("!punet") + self.map["unet"]["function_name"] = "run_forward" # LOAD - def load_pipeline( + def load_scheduler( self, - vmfbs: dict, - weights: dict, - rt_device: str = "local-task", - compiled_pipeline: bool = False, + scheduler_id: str, + steps: int = 30, ): - self.is_img2img = False - self.runners = {} - runners = {} - self.tokenizers = [] - self.tokenizers.append( - CLIPTokenizer.from_pretrained( - self.hf_model_name, - subfolder="tokenizer", - ) + self.scheduler = schedulers.get_scheduler( + self.base_model_name, self.scheduler_id ) - if self.is_sdxl: - self.tokenizers.append( - CLIPTokenizer.from_pretrained( - self.hf_model_name, - subfolder="tokenizer_2", + if self.is_sd3: + scheduler_device = self.mmdit.device + else: + scheduler_device = self.unet.device + if not self.cpu_scheduling: + self.scheduler = None + self.num_inference_steps = steps + self.scheduler_id = scheduler_id + scheduler_path = f"{scheduler_id}Scheduler_{self.num_inference_steps}" + if not os.path.exists(scheduler_path): + scheduler_path, _ = self.export_submodel("scheduler") + try: + self.scheduler = schedulers.SharkSchedulerWrapper( + scheduler_device, + scheduler_path, ) - ) - runners["clip"] = vmfbRunner(rt_device, vmfbs["clip"], weights["clip"]) - runners["unet"] = vmfbRunner(rt_device, vmfbs["unet"], weights["unet"]) - runners["vae_decode"] = vmfbRunner( - rt_device, vmfbs["vae_decode"], weights["vae_decode"] - ) - self.runners = runners - self.compiled_pipeline = False + except: + print("JIT export of scheduler failed. Loading CPU scheduler.") + self.cpu_scheduling = True if self.cpu_scheduling: - # torch_scheduler = schedulers.SchedulingModel( - # schedulers.get_scheduler(self.hf_model_name, self.scheduler_id), - # self.height, - # self.width, - # self.num_inference_steps, - # self.torch_dtype, - # ) - # self.scheduler = schedulers.SharkSchedulerCPUWrapper( - # self, torch_scheduler - # ) - self.scheduler = schedulers.get_scheduler( - self.hf_model_name, self.scheduler_id - ) - else: - self.scheduler = schedulers.SharkSchedulerWrapper( - rt_device, vmfbs["scheduler"], weights["scheduler"] + scheduler = schedulers.get_scheduler(self.base_model_name, scheduler_id) + self.scheduler = schedulers.SharkSchedulerCPUWrapper( + scheduler, + self.batch_size, + scheduler_device, + latents_dtype=self.latents_dtype, ) - print("Successfully loaded pipeline.") + if self.use_punet: + self.scheduler.use_punet = True # RUN + def encode_prompts_sdxl(self, prompt, negative_prompt): + # Tokenize prompt and negative prompt. 
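+        # SDXL uses two CLIP tokenizers (tokenizer and tokenizer_2); token ids from
+        # both are collected below. With a fully compiled pipeline the raw ids are
+        # returned so the compiled prompt encoder can consume them directly;
+        # otherwise the text encoder module's "encode_prompts" entry point is called
+        # here and returns the prompt embeddings and added text embeddings.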
+ text_input_ids_list = [] + uncond_input_ids_list = [] + + for tokenizer in self.tokenizers: + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=self.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=self.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids_list += text_inputs.input_ids.unsqueeze(0) + uncond_input_ids_list += uncond_input.input_ids.unsqueeze(0) + + if self.compiled_pipeline: + return text_input_ids_list, uncond_input_ids_list + else: + prompt_embeds, add_text_embeds = self.text_encoder( + "encode_prompts", [*text_input_ids_list, *uncond_input_ids_list] + ) + return prompt_embeds, add_text_embeds + def prepare_latents( self, noise, num_inference_steps, - image, - strength, + image=None, + strength=None, ): - self.scheduler.set_timesteps(num_inference_steps) if self.is_img2img: - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps + raise NotImplementedError("Image-to-image not supported yet.") + elif self.is_sdxl: + sample, add_time_ids, step_indexes, timesteps = ( + self.scheduler.initialize_sdxl(noise, num_inference_steps) ) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - latents = self.encode_image(image) - latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1)) - return latents, [timesteps] + return sample, add_time_ids, step_indexes, timesteps + elif self.is_sd3: + raise NotImplementedError("Stable Diffusion 3 not supported yet.") else: - self.scheduler.is_scale_input_called = True - latents = noise * self.scheduler.init_noise_sigma - return latents, self.scheduler.timesteps + sample, timesteps = self.scheduler.initialize_sd(noise, num_inference_steps) + return sample, timesteps - def generate_images( - self, - prompt: str, - negative_prompt: str = "", - batch_count: int = 1, - guidance_scale: float = 7.5, - seed: float = -1, - return_imgs: bool = False, - ): - pipe_start = time.time() + def get_rand_latents(self, seed, batch_count): samples = [] - numpy_images = [] - uint32_info = np.iinfo(np.uint32) uint32_min, uint32_max = uint32_info.min, uint32_info.max if seed < uint32_min or seed >= uint32_max: seed = randint(uint32_min, uint32_max) - - generator = torch.manual_seed(seed) for i in range(batch_count): - generator = torch.random.manual_seed(seed + i) + generator = torch.manual_seed(seed + i) rand_sample = torch.randn( ( self.batch_size, @@ -449,110 +453,163 @@ def generate_images( self.width // 8, ), generator=generator, - dtype=self.torch_dtype, + dtype=self.latents_dtype, ) samples.append(rand_sample) - # samples.append( - # ireert.asdevicearray( - # self.runners["unet"].config.device, - # rand_sample, - # dtype=self.iree_dtype, - # ) - # ) - - guidance_scale = ireert.asdevicearray( - self.runners["unet"].config.device, - np.asarray([guidance_scale]), - dtype=self.iree_dtype, - ) - - tokenize_start = time.time() - - # Tokenize prompt and negative prompt. 
+ return samples - prompt_embeds, negative_embeds = get_weighted_text_embeddings( - self, prompt, negative_prompt + def _produce_latents_sd( + self, + sample, + prompt_embeds, + negative_prompt_embeds, + steps, + guidance_scale, + ): + image = None + strength = 0 + sample, timesteps = self.prepare_latents( + sample, self.num_inference_steps, image, strength ) + text_embeddings = torch.cat((negative_prompt_embeds, prompt_embeds), dim=0) + self.scheduler.do_guidance = False + for i, t in tqdm(enumerate(timesteps)): + latent_model_input, _ = self.scheduler.scale_model_input(sample, t) + timestep = torch.tensor([t]) + unet_inputs = [ + latent_model_input, + timestep, + ] + unet_inputs.extend([text_embeddings, [guidance_scale]]) + latents = self.unet(self.map["unet"]["function_name"], unet_inputs) + sample = self.scheduler.step( + torch.tensor( + latents, dtype=torch_dtypes[self.map["unet"]["precision"]] + ), + t, + sample, + ) + return sample - text_embeddings = torch.cat((negative_embeds, prompt_embeds), dim=0) - text_embeddings = ireert.asdevicearray( - self.runners["unet"].config.device, - text_embeddings, - dtype=self.iree_dtype, + def _produce_latents_sdxl( + self, + sample, + prompt_embeds, + add_text_embeds, + steps, + guidance_scale, + ): + image = None + strength = 0 + latents, add_time_ids, step_indexes, timesteps = self.prepare_latents( + sample, self.num_inference_steps, image, strength ) - encode_prompts_end = time.time() - - for i in range(batch_count): - unet_start = time.time() - image = None - strength = 0 - sample, timesteps = self.prepare_latents( - samples[i], self.num_inference_steps, image, strength + self.scheduler.do_guidance = False + self.scheduler.repeat_sample = False + for i, t in tqdm(enumerate(timesteps)): + if self.cpu_scheduling: + step_index = i + else: + step_index = torch.tensor([i]) + latent_model_input, t = self.scheduler.scale_model_input( + latents, + t, ) - - for i, t in tqdm(enumerate(timesteps)): - latents = self.scheduler.scale_model_input(sample, t).to( - self.torch_dtype + unet_inputs = [ + latent_model_input, + t, + prompt_embeds, + add_text_embeds, + add_time_ids, + ireert.asdevicearray( + self.unet.device, + [guidance_scale], + dtype=self.map["unet"]["np_dtype"], + ), + ] + if self.use_punet: + unet_inputs[1] = ireert.asdevicearray( + self.unet.device, + t, + dtype=self.map["unet"]["np_dtype"], ) - timestep = torch.tensor([t]).to(self.torch_dtype).detach().numpy() - unet_inputs = [ - latents, - timestep, - ] - if self.cpu_scheduling: - for inp in unet_inputs: - inp = ireert.asdevicearray( - self.runners["unet"].config.device, - inp, - dtype=self.iree_dtype, + for inp_idx, inp in enumerate(unet_inputs): + if not isinstance(inp, ireert.DeviceArray): + unet_inputs[inp_idx] = ireert.asdevicearray( + self.unet.device, inp, dtype=self.map["unet"]["np_dtype"] ) - unet_inputs.extend([text_embeddings, guidance_scale]) - latents = self.runners["unet"].ctx.modules.compiled_unet["main"]( - *unet_inputs - ) - sample = self.scheduler.step( - torch.tensor(latents.to_host(), dtype=self.torch_dtype), t, sample - ).prev_sample - - vae_start = time.time() - vae_out = self.runners["vae_decode"].ctx.modules.compiled_vae["main"]( - sample + noise_pred = self.unet( + self.map["unet"]["function_name"], + unet_inputs, ) + latents = self.scheduler.step( + noise_pred, + t, + latents, + ) + return latents - pipe_end = time.time() + def generate_images( + self, + prompt: str, + negative_prompt: str = "", + steps: int = 30, + batch_count: int = 1, + guidance_scale: float = 
7.5, + seed: float = -1, + cpu_scheduling: bool = True, + scheduler_id: str = "EulerDiscrete", + return_imgs: bool = False, + ): + needs_new_scheduler = ( + (steps and steps != self.num_inference_steps) + or (cpu_scheduling != self.cpu_scheduling) + and self.split_scheduler + ) + if not self.scheduler and not self.compiled_pipeline: + needs_new_scheduler = True - image = vae_out.to_host() + if guidance_scale == 0: + negative_prompt = prompt + prompt = "" - numpy_images.append(image) - print("Batch #", i + 1, "\n") - print( - "UNet time(", - self.num_inference_steps, - "): ", - vae_start - unet_start, - "sec,", - ) - print( - "Unet average step latency: ", - (vae_start - unet_start) / self.num_inference_steps, - "sec", - ) - print("VAE time: ", pipe_end - vae_start, "sec") - print( - f"\nTotal time (txt2img, batch #{str(i+1)}): ", - (encode_prompts_end - tokenize_start) + (pipe_end - unet_start), - "sec\n", + self.cpu_scheduling = cpu_scheduling + if steps and needs_new_scheduler: + self.num_inference_steps = steps + self.load_scheduler(scheduler_id, steps) + + pipe_start = time.time() + numpy_images = [] + + samples = self.get_rand_latents(seed, batch_count) + + # Tokenize prompt and negative prompt. + if self.is_sdxl: + prompt_embeds, negative_embeds = self.encode_prompts_sdxl( + prompt, negative_prompt ) - end = time.time() - print("Total CLIP time:", encode_prompts_end - tokenize_start, "sec") - print("Total tokenize time:", tokenize_start - tokenize_start, "sec") - print("Loading time: ", tokenize_start - pipe_start, "sec") - if batch_count > 1: - print( - f"Total inference time ({batch_count} batch(es)):", - end - tokenize_start, - "sec", + else: + prompt_embeds, negative_embeds = encode_prompt( + self, prompt, negative_prompt ) + + for i in range(batch_count): + produce_latents_input = [ + samples[i], + prompt_embeds, + negative_embeds, + steps, + guidance_scale, + ] + if self.is_sdxl: + latents = self._produce_latents_sdxl(*produce_latents_input) + else: + latents = self._produce_latents_sd(*produce_latents_input) + image = self.vae("decode", [latents]) + numpy_images.append(image) + pipe_end = time.time() + + logging.info(f"Total inference time: {pipe_end - pipe_start:.2f}s") timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S") images = [] for idx, image in enumerate(numpy_images): @@ -587,9 +644,6 @@ def numpy_to_pil_image(images): if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args - mlirs = copy.deepcopy(SUBMODELS) - vmfbs = copy.deepcopy(SUBMODELS) - weights = copy.deepcopy(SUBMODELS) ireec_flags = { "clip": args.ireec_flags + args.clip_flags, "scheduler": args.ireec_flags, @@ -597,37 +651,22 @@ def numpy_to_pil_image(images): "vae_decode": args.ireec_flags + args.vae_flags, } if not args.pipeline_dir: - pipe_id_list = [ - utils.create_safe_name(args.hf_model_name, args.iree_target_triple), - str(args.height), - str(args.width), - str(args.max_length), - args.precision, - args.device, - ] - args.pipeline_dir = os.path.join( - ".", - "_".join(pipe_id_list), - ) - if args.input_mlir: - user_mlir_list = args.input_mlir.split(",") - else: - user_mlir_list = [] - for submodel_id, mlir_path in zip(mlirs.keys(), user_mlir_list): - if submodel_id in mlir_path: - mlirs[submodel_id] = mlir_path - if not args.external_weights_dir and args.external_weights: - args.external_weights_dir = args.pipeline_dir - + args.pipeline_dir = utils.create_safe_name(args.hf_model_name, "") + if any(x for x in [args.vae_decomp_attn, args.unet_decomp_attn]): + 
args.decomp_attn = { + "text_encoder": args.decomp_attn, + "unet": ( + args.unet_decomp_attn if args.unet_decomp_attn else args.decomp_attn + ), + "vae": args.vae_decomp_attn if args.vae_decomp_attn else args.decomp_attn, + } sd_pipe = SharkSDPipeline( args.hf_model_name, - args.scheduler_id, args.height, args.width, - args.precision, - args.max_length, args.batch_size, - args.num_inference_steps, + args.max_length, + args.precision, args.device, args.iree_target_triple, ireec_flags, @@ -636,16 +675,23 @@ def numpy_to_pil_image(images): args.pipeline_dir, args.external_weights_dir, args.external_weights, - args.vae_decomp_attn, + args.num_inference_steps, + args.cpu_scheduling, + args.scheduler_id, + None, + args.use_i8_punet, ) - vmfbs, weights = sd_pipe.check_prepared(mlirs, vmfbs, weights) - sd_pipe.load_pipeline(vmfbs, weights, args.rt_device, args.compiled_pipeline) + sd_pipe.prepare_all() + sd_pipe.load_map() sd_pipe.generate_images( args.prompt, args.negative_prompt, + args.num_inference_steps, args.batch_count, args.guidance_scale, args.seed, + args.cpu_scheduling, + args.scheduler_id, False, ) print("Image generation complete.") diff --git a/models/turbine_models/custom_models/sd_inference/tokenization.py b/models/turbine_models/custom_models/sd_inference/tokenization.py index cfc140c57..e35d37e06 100644 --- a/models/turbine_models/custom_models/sd_inference/tokenization.py +++ b/models/turbine_models/custom_models/sd_inference/tokenization.py @@ -3,416 +3,175 @@ import re import torch import numpy as np - -re_attention = re.compile( - r""" -\\\(| -\\\)| -\\\[| -\\]| -\\\\| -\\| -\(| -\[| -:([+-]?[.\d]+)\)| -\)| -]| -[^\\()\[\]:]+| -: -""", - re.X, -) - - -def parse_prompt_attention(text): - """ - Parses a string with attention tokens and returns a list of pairs: - text and its associated weight. 
- Accepted tokens are: - (abc) - increases attention to abc by a multiplier of 1.1 - (abc:3.12) - increases attention to abc by a multiplier of 3.12 - [abc] - decreases attention to abc by a multiplier of 1.1 - \( - literal character '(' - \[ - literal character '[' - \) - literal character ')' - \] - literal character ']' - \\ - literal character '\' - anything else - just text - >>> parse_prompt_attention('normal text') - [['normal text', 1.0]] - >>> parse_prompt_attention('an (important) word') - [['an ', 1.0], ['important', 1.1], [' word', 1.0]] - >>> parse_prompt_attention('(unbalanced') - [['unbalanced', 1.1]] - >>> parse_prompt_attention('\(literal\]') - [['(literal]', 1.0]] - >>> parse_prompt_attention('(unnecessary)(parens)') - [['unnecessaryparens', 1.1]] - >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') - [['a ', 1.0], - ['house', 1.5730000000000004], - [' ', 1.1], - ['on', 1.0], - [' a ', 1.1], - ['hill', 0.55], - [', sun, ', 1.1], - ['sky', 1.4641000000000006], - ['.', 1.1]] - """ - - res = [] - round_brackets = [] - square_brackets = [] - - round_bracket_multiplier = 1.1 - square_bracket_multiplier = 1 / 1.1 - - def multiply_range(start_position, multiplier): - for p in range(start_position, len(res)): - res[p][1] *= multiplier - - for m in re_attention.finditer(text): - text = m.group(0) - weight = m.group(1) - - if text.startswith("\\"): - res.append([text[1:], 1.0]) - elif text == "(": - round_brackets.append(len(res)) - elif text == "[": - square_brackets.append(len(res)) - elif weight is not None and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), float(weight)) - elif text == ")" and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), round_bracket_multiplier) - elif text == "]" and len(square_brackets) > 0: - multiply_range(square_brackets.pop(), square_bracket_multiplier) - else: - res.append([text, 1.0]) - - for pos in round_brackets: - multiply_range(pos, round_bracket_multiplier) - - for pos in square_brackets: - multiply_range(pos, square_bracket_multiplier) - - if len(res) == 0: - res = [["", 1.0]] - - # merge runs of identical weights - i = 0 - while i + 1 < len(res): - if res[i][1] == res[i + 1][1]: - res[i][0] += res[i + 1][0] - res.pop(i + 1) - else: - i += 1 - - return res +import warnings -def get_prompts_with_weights(tokenizer, prompt: List[str], max_length: int): - r""" - Tokenize a list of prompts and return its tokens with weights of each token. - No padding, starting or ending token is included. - """ - tokens = [] - weights = [] - truncated = False - for text in prompt: - texts_and_weights = parse_prompt_attention(text) - text_token = [] - text_weight = [] - for word, weight in texts_and_weights: - # tokenize and discard the starting and the ending token - token = tokenizer(word).input_ids[1:-1] - text_token += token - # copy the weight by length of token - text_weight += [weight] * len(token) - # stop if the text is too long (longer than truncation limit) - if len(text_token) > max_length: - truncated = True - break - # truncate - if len(text_token) > max_length: - truncated = True - text_token = text_token[:max_length] - text_weight = text_weight[:max_length] - tokens.append(text_token) - weights.append(text_weight) - if truncated: - print( - "Prompt was truncated. 
Try to shorten the prompt or increase max_embeddings_multiples" - ) - return tokens, weights - - -def pad_tokens_and_weights( - tokens, - weights, - max_length, - bos, - eos, - no_boseos_middle=True, - chunk_length=77, -): - r""" - Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. - """ - max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = ( - max_length if no_boseos_middle else max_embeddings_multiples * chunk_length - ) - for i in range(len(tokens)): - tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i])) - if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) - else: - w = [] - if len(weights[i]) == 0: - w = [1.0] * weights_length - else: - for j in range(max_embeddings_multiples): - w.append(1.0) # weight for starting token in this chunk - w += weights[i][ - j - * (chunk_length - 2) : min( - len(weights[i]), (j + 1) * (chunk_length - 2) - ) - ] - w.append(1.0) # weight for ending token in this chunk - w += [1.0] * (weights_length - len(w)) - weights[i] = w[:] - - return tokens, weights - - -def get_unweighted_text_embeddings( +# The following is copied from Diffusers' "encode_prompt" function in the StableDiffusion pipeline. +# It has been lightly augmented to work with the SHARK-Turbine pipeline. +def encode_prompt( pipe, - text_input, - chunk_length: int, - no_boseos_middle: Optional[bool] = True, + prompt, + negative_prompt=None, + num_images_per_prompt=1, + do_classifier_free_guidance=True, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. """ - When the length of tokens is a multiple of the capacity of the text encoder, - it should be split into chunks and sent to the text encoder individually. 
- """ - max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) - if max_embeddings_multiples > 1: - text_embeddings = [] - for i in range(max_embeddings_multiples): - # extract the i-th chunk - text_input_chunk = text_input[ - :, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2 - ].clone() - - # cover the head and the tail by the starting and the ending tokens - text_input_chunk[:, 0] = text_input[0, 0] - text_input_chunk[:, -1] = text_input[0, -1] - - text_input_chunk = ireert.asdevicearray( - pipe.runners["clip"].config.device, text_input_chunk, "int64" - ) - text_embedding = ( - pipe.runners["clip"].ctx.modules.compiled_clip["main"](text_input_chunk) - )[0].to_host() - if no_boseos_middle: - if i == 0: - # discard the ending token - text_embedding = text_embedding[:, :-1] - elif i == max_embeddings_multiples - 1: - # discard the starting token - text_embedding = text_embedding[:, 1:] - else: - # discard both starting and ending tokens - text_embedding = text_embedding[:, 1:-1] - - text_embeddings.append(text_embedding) - # SHARK: Convert the result to tensor - # text_embeddings = torch.concat(text_embeddings, axis=1) - text_embeddings_np = np.concatenate(np.array(text_embeddings)) - text_embeddings = torch.from_numpy(text_embeddings_np) + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + # if lora_scale is not None and pipe.use_lora: + # pipe._lora_scale = lora_scale + + # # dynamically adjust the LoRA scale + # if not USE_PEFT_BACKEND: + # adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale) + # else: + # scale_lora_layers(pipe.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) else: - text_input = ireert.asdevicearray( - pipe.runners["clip"].config.device, text_input, "int64" - ) - text_embeddings = ( - pipe.runners["clip"].ctx.modules.compiled_clip["main"](text_input) - )[0].to_host() - text_embeddings = torch.from_numpy(text_embeddings) - return text_embeddings - - -# This function deals with NoneType values occuring in tokens after padding -# It switches out None with 49407 as truncating None values causes matrix dimension errors, -def filter_nonetype_tokens(tokens: List[List]): - return [[49407 if token is None else token for token in tokens[0]]] + batch_size = prompt_embeds.shape[0] + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + # if pipe.use_textual_inversion: + # prompt = pipe.maybe_convert_prompt(prompt, pipe.tokenizer) -def get_tokenized_inputs( - pipe, - tokenizer, - prompt, - uncond_prompt, - max_length, - max_embeddings_multiples: Optional[int] = 8, - no_boseos_middle: Optional[bool] = True, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, -): - if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights( - tokenizer, prompt, max_length - 2 + text_inputs = pipe.tokenizer( + prompt, + padding="max_length", + max_length=pipe.model_max_length, + truncation=True, + return_tensors="pt", ) - if uncond_prompt is not None: - uncond_tokens, uncond_weights = get_prompts_with_weights( - tokenizer, uncond_prompt, max_length - 2 + text_input_ids = text_inputs.input_ids + untruncated_ids = pipe.tokenizer( + prompt, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, 
untruncated_ids + ): + removed_text = pipe.tokenizer.batch_decode( + untruncated_ids[:, pipe.model_max_length - 1 : -1] ) - else: - prompt_tokens = [ - token[1:-1] - for token in tokenizer( - prompt, max_length=max_length, truncation=True - ).input_ids - ] - prompt_weights = [[1.0] * len(token) for token in prompt_tokens] - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens = [ - token[1:-1] - for token in tokenizer( - uncond_prompt, max_length=max_length, truncation=True - ).input_ids - ] - uncond_weights = [[1.0] * len(token) for token in uncond_tokens] - - # round up the longest length of tokens to a multiple of (model_max_length - 2) - max_length = max([len(token) for token in prompt_tokens]) - if uncond_prompt is not None: - max_length = max(max_length, max([len(token) for token in uncond_tokens])) - max_embeddings_multiples = min( - max_embeddings_multiples, - (max_length - 1) // (pipe.model_max_length - 2) + 1, - ) - max_embeddings_multiples = max(1, max_embeddings_multiples) - - max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 - - # pad the length of tokens and weights - bos = tokenizer.bos_token_id - eos = tokenizer.eos_token_id - prompt_tokens, prompt_weights = pad_tokens_and_weights( - prompt_tokens, - prompt_weights, - max_length, - bos, - eos, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.model_max_length, - ) + warnings.warn( + "The following text was removed due to truncation: " + removed_text + ) + if pipe.text_encoder.metadata.get("use_attention_mask"): + attention_mask = text_inputs.attention_mask + prompt_embeds = pipe.text_encoder( + "encode_tokens_attn_mask", [text_input_ids, attention_mask] + ) + else: + attention_mask = None + prompt_embeds = pipe.text_encoder("encode_tokens", [text_input_ids]) + prompt_embeds = prompt_embeds[0] + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt - # FIXME: This is a hacky fix caused by tokenizer padding with None values - prompt_tokens = filter_nonetype_tokens(prompt_tokens) + # textual inversion: process multi-vector tokens if necessary + # if pipe.use_textual_inversion: + # uncond_tokens = pipe.maybe_convert_prompt(uncond_tokens, pipe.tokenizer) - # prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device) - prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device="cpu") - if uncond_prompt is not None: - uncond_tokens, uncond_weights = pad_tokens_and_weights( + max_length = prompt_embeds.shape[1] + uncond_input = pipe.tokenizer( uncond_tokens, - uncond_weights, - max_length, - bos, - eos, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.model_max_length, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", ) - # FIXME: This is a hacky fix caused by tokenizer padding with None values - uncond_tokens = filter_nonetype_tokens(uncond_tokens) + if pipe.text_encoder.metadata.get("use_attention_mask"): + attention_mask = uncond_input.attention_mask + negative_prompt_embeds = pipe.text_encoder( + "encode_tokens_attn_mask", + [ + uncond_input.input_ids, + attention_mask, + ], + ) + else: + attention_mask = None + negative_prompt_embeds = pipe.text_encoder( + "encode_tokens", + [ + uncond_input.input_ids, + ], + ) - # uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device) - uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device="cpu") - if uncond_prompt is not None: - return prompt_tokens, prompt_weights, uncond_tokens, uncond_weights - else: - return prompt_tokens, prompt_weights, None, None + negative_prompt_embeds = negative_prompt_embeds[0] + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] -def get_weighted_text_embeddings( - pipe, - prompt: List[str], - uncond_prompt: List[str] = None, - max_embeddings_multiples: Optional[int] = 8, - no_boseos_middle: Optional[bool] = True, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, -): - max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2 - for tokenizer in pipe.tokenizers: - ( - prompt_tokens, - prompt_weights, - uncond_tokens, - uncond_weights, - ) = get_tokenized_inputs( - pipe, - tokenizer, - prompt, - uncond_prompt, - max_length, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, + negative_prompt_embeds = negative_prompt_embeds.repeat( + 1, num_images_per_prompt, 1 ) - - # get the embeddings - text_embeddings = get_unweighted_text_embeddings( - pipe, - prompt_tokens, - pipe.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - # prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device) - prompt_weights = torch.tensor(prompt_weights, dtype=torch.float, device="cpu") - if uncond_prompt is not None: - uncond_embeddings = get_unweighted_text_embeddings( - pipe, - uncond_tokens, - pipe.model_max_length, - no_boseos_middle=no_boseos_middle, + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 ) - # uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device) - uncond_weights = torch.tensor(uncond_weights, dtype=torch.float, device="cpu") - # assign weights to the prompts and normalize in the 
sense of mean - # TODO: should we normalize by chunk or in a whole (current implementation)? - if (not skip_parsing) and (not skip_weighting): - previous_mean = ( - text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype) - ) - text_embeddings *= prompt_weights.unsqueeze(-1) - current_mean = ( - text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype) - ) - text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1) - if uncond_prompt is not None: - previous_mean = ( - uncond_embeddings.float() - .mean(axis=[-2, -1]) - .to(uncond_embeddings.dtype) - ) - uncond_embeddings *= uncond_weights.unsqueeze(-1) - current_mean = ( - uncond_embeddings.float() - .mean(axis=[-2, -1]) - .to(uncond_embeddings.dtype) - ) - uncond_embeddings *= ( - (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1) - ) + # if pipe.use_lora: + # Retrieve the original scale by scaling back the LoRA layers + # unimplemented + # unscale_lora_layers(pipe.text_encoder, lora_scale) - if uncond_prompt is not None: - return text_embeddings, uncond_embeddings - return text_embeddings, None + return prompt_embeds, negative_prompt_embeds diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index ac66d3108..dac967b8a 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -15,6 +15,7 @@ from shark_turbine.dynamo.passes import ( DEFAULT_DECOMPOSITIONS, ) +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from turbine_models.custom_models.sd_inference import utils import torch import torch._dynamo as dynamo @@ -28,37 +29,38 @@ class UnetModel(torch.nn.Module): def __init__(self, hf_model_name): super().__init__() + self.do_classifier_free_guidance = True self.unet = UNet2DConditionModel.from_pretrained( hf_model_name, subfolder="unet", ) - def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): - samples = torch.cat([sample] * 2) - unet_out = self.unet.forward( - samples, timestep, encoder_hidden_states, return_dict=False + def forward( + self, latent_model_input, timestep, encoder_hidden_states, guidance_scale + ): + noise_pred = self.unet.forward( + latent_model_input, timestep, encoder_hidden_states, return_dict=False )[0] - noise_pred_uncond, noise_pred_text = unet_out.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) + if self.do_classifier_free_guidance: + noise_preds = noise_pred.chunk(2) + noise_pred = noise_preds[0] + guidance_scale * ( + noise_preds[1] - noise_preds[0] + ) return noise_pred def export_unet_model( - unet_model, hf_model_name, batch_size, height, width, precision="fp32", max_length=77, - hf_auth_token=None, compile_to="torch", external_weights=None, external_weight_path=None, device=None, - target_triple=None, + target=None, ireec_flags=None, decomp_attn=False, exit_on_vmfb=False, @@ -68,22 +70,28 @@ def export_unet_model( weights_only=False, upload_ir=False, ): - if "turbo" in hf_model_name: - do_classifier_free_guidance = False - else: - do_classifier_free_guidance = True - if pipeline_dir: - safe_name = os.path.join(pipeline_dir, f"unet") + if input_mlir: + unet_model = None else: - safe_name = utils.create_safe_name( + unet_model = UnetModel( hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet_{device}", ) + dtype = torch.float16 if precision == "fp16" else torch.float32 + 
np_dtype = "float16" if precision == "fp16" else "float32" + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet", + ) + if decomp_attn: + safe_name += "_decomp_attn" + if pipeline_dir: + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, - target_triple, + target, ireec_flags, safe_name, mlir_source="file", @@ -93,15 +101,6 @@ def export_unet_model( return vmfb_path mapper = {} - decomp_list = copy.deepcopy(DEFAULT_DECOMPOSITIONS) - if decomp_attn == True: - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) - dtype = torch.float16 if precision == "fp16" else torch.float32 if precision == "fp16": unet_model = unet_model.half() @@ -114,76 +113,96 @@ def export_unet_model( return external_weight_path sample = ( - batch_size, + batch_size * 2, unet_model.unet.config.in_channels, height // 8, width // 8, ) - encoder_hidden_states_sizes = ( unet_model.unet.config.layers_per_block, max_length, unet_model.unet.config.cross_attention_dim, ) - - class CompiledUnet(CompiledModule): - if external_weights: - params = export_parameters( - unet_model, external=True, external_scope="", name_mapper=mapper.get - ) - else: - params = export_parameters(unet_model) - - def main( - self, - sample=AbstractTensor(*sample, dtype=dtype), - timestep=AbstractTensor(1, dtype=dtype), - encoder_hidden_states=AbstractTensor( - *encoder_hidden_states_sizes, dtype=dtype - ), - guidance_scale=AbstractTensor(1, dtype=dtype), + example_forward_args = [ + torch.empty(sample, dtype=dtype), + torch.empty(1, dtype=dtype), + torch.empty(encoder_hidden_states_sizes, dtype=dtype), + torch.empty(1, dtype=dtype), + ] + decomp_list = [] + if decomp_attn: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(unet_model) + + @fxb.export_program( + args=(example_forward_args,), + ) + def _forward( + module, + inputs, ): - return jittable(unet_model.forward, decompose_ops=decomp_list)( - sample, timestep, encoder_hidden_states, guidance_scale - ) - - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledUnet(context=Context(), import_to=import_to) + return module.forward(*inputs) - module_str = str(CompiledModule.get_mlir_module(inst)) + class CompiledUnet(CompiledModule): + run_forward = _forward + if external_weights: + externalize_module_parameters(unet_model) + + inst = CompiledUnet(context=Context(), import_to="IMPORT") + + module = CompiledModule.get_mlir_module(inst) + + model_metadata_run_forward = { + "model_name": "sd_unet", + "input_shapes": [ + sample, + (1,), + encoder_hidden_states_sizes, + (1,), + ], + "input_dtypes": [np_dtype for x in range(4)], + "output_shapes": [sample], + "output_dtypes": [np_dtype], + } + + module = AddMetadataPass(module, model_metadata_run_forward, "run_forward").run() + module_str = str(module) if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb( + vmfb_path = utils.compile_to_vmfb( module_str, device, - target_triple, + target, ireec_flags, safe_name, - return_path=False, + return_path=True, attn_spec=attn_spec, ) + if exit_on_vmfb: + exit() + return 
vmfb_path if __name__ == "__main__": from turbine_models.custom_models.sd_inference.sd_cmd_opts import args - if args.input_mlir: - unet_model = None - else: - unet_model = UnetModel( - args.hf_model_name, - ) mod_str = export_unet_model( - unet_model, args.hf_model_name, args.batch_size, args.height, args.width, args.precision, args.max_length, - args.hf_auth_token, args.compile_to, args.external_weights, args.external_weight_path, diff --git a/models/turbine_models/custom_models/sd_inference/unet_runner.py b/models/turbine_models/custom_models/sd_inference/unet_runner.py index 172229e77..12e420960 100644 --- a/models/turbine_models/custom_models/sd_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sd_inference/unet_runner.py @@ -15,16 +15,18 @@ def run_unet( hf_model_name, hf_auth_token, external_weight_path, + iree_dtype, ): runner = vmfbRunner(device, vmfb_path, external_weight_path) - inputs = [ - ireert.asdevicearray(runner.config.device, sample), - ireert.asdevicearray(runner.config.device, timestep), - ireert.asdevicearray(runner.config.device, encoder_hidden_states), - ireert.asdevicearray(runner.config.device, guidance_scale), + ireert.asdevicearray(runner.config.device, sample, dtype=iree_dtype), + ireert.asdevicearray(runner.config.device, timestep, dtype=iree_dtype), + ireert.asdevicearray( + runner.config.device, encoder_hidden_states, dtype=iree_dtype + ), + ireert.asdevicearray(runner.config.device, guidance_scale, dtype=iree_dtype), ] - results = runner.ctx.modules.compiled_unet["main"](*inputs) + results = runner.ctx.modules.compiled_unet["run_forward"](*inputs) return results @@ -36,32 +38,10 @@ def run_torch_unet( encoder_hidden_states, guidance_scale, ): - from diffusers import UNet2DConditionModel - - class UnetModel(torch.nn.Module): - def __init__(self, hf_model_name, hf_auth_token): - super().__init__() - self.unet = UNet2DConditionModel.from_pretrained( - hf_model_name, - subfolder="unet", - token=hf_auth_token, - ) - self.guidance_scale = 7.5 - - def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): - samples = torch.cat([sample] * 2) - unet_out = self.unet.forward( - samples, timestep, encoder_hidden_states, return_dict=False - )[0] - noise_pred_uncond, noise_pred_text = unet_out.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - return noise_pred + from turbine_models.custom_models.sd_inference.unet import UnetModel unet_model = UnetModel( hf_model_name, - hf_auth_token, ) results = unet_model.forward( sample, timestep, encoder_hidden_states, guidance_scale @@ -72,15 +52,21 @@ def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): if __name__ == "__main__": args = parser.parse_args() + iree_dtypes = { + "fp16": "float16", + "fp32": "float32", + } sample = torch.rand( - args.batch_size, 4, args.height // 8, args.width // 8, dtype=torch.float32 + args.batch_size * 2, 4, args.height // 8, args.width // 8, dtype=torch.float32 ) timestep = torch.zeros(1, dtype=torch.float32) guidance_scale = torch.Tensor([7.5], dtype=torch.float32) if args.hf_model_name == "CompVis/stable-diffusion-v1-4": - encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) + encoder_hidden_states = torch.rand(2, args.max_length, 768, dtype=torch.float32) elif args.hf_model_name == "stabilityai/stable-diffusion-2-1-base": - encoder_hidden_states = torch.rand(2, 77, 1024, dtype=torch.float32) + encoder_hidden_states = torch.rand( + 2, args.max_length, 1024, 
dtype=torch.float32 + ) turbine_output = run_unet( args.device, @@ -92,6 +78,7 @@ def forward(self, sample, timestep, encoder_hidden_states, guidance_scale): args.hf_model_name, args.hf_auth_token, args.external_weight_path, + iree_dtypes[args.precision], ) print( "TURBINE OUTPUT:", diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index cf6b5946a..447076d42 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -12,17 +12,6 @@ # DPMSolverSDEScheduler, ) -_IREE_DEVICE_MAP = { - "cpu": "local-task", - "cpu-task": "local-task", - "cpu-sync": "local-sync", - "cuda": "cuda", - "vulkan": "vulkan", - "metal": "metal", - "rocm": "rocm", - "hip": "hip", - "intel-gpu": "level_zero", -} # If flags are verified to work on a specific model and improve performance without regressing numerics, add them to this dictionary. If you are working with bleeding edge flags, please add them manually with the --ireec_flags argument. MI_flags = { "all": [ @@ -102,22 +91,53 @@ ], } +_IREE_DRIVER_MAP = { + "cpu": "local-task", + "cpu-task": "local-task", + "cpu-sync": "local-sync", + "cuda": "cuda", + "vulkan": "vulkan", + "metal": "metal", + "rocm": "hip", + "rocm-legacy": "rocm", + "hip": "hip", + "intel-gpu": "level_zero", +} + +_IREE_BACKEND_MAP = { + "cpu": "llvm-cpu", + "rocm": "rocm", + "rocm-legacy": "rocm", + "hip": "rocm", + "cuda": "cuda", + "vulkan": "vulkan-spirv", + "metal": "metal", +} + def iree_device_map(device): uri_parts = device.split("://", 2) iree_driver = ( - _IREE_DEVICE_MAP[uri_parts[0]] - if uri_parts[0] in _IREE_DEVICE_MAP + _IREE_DRIVER_MAP[uri_parts[0]] + if uri_parts[0] in _IREE_DRIVER_MAP else uri_parts[0] ) if len(uri_parts) == 1: return iree_driver - elif "rocm" in uri_parts: - return "rocm" else: return f"{iree_driver}://{uri_parts[1]}" +def iree_backend_map(device): + uri_parts = device.split("://", 2) + iree_device = ( + _IREE_BACKEND_MAP[uri_parts[0]] + if uri_parts[0] in _IREE_BACKEND_MAP + else uri_parts[0] + ) + return iree_device + + def compile_to_vmfb( module_str, device, @@ -132,6 +152,7 @@ def compile_to_vmfb( attn_spec=None, winograd=False, masked_attention=False, + debug=False, ): flags = [] if mlir_source == "file" and not isinstance(module_str, str): @@ -173,7 +194,7 @@ def compile_to_vmfb( ] ) device = "vulkan-spirv" - elif device == "rocm": + elif device in ["rocm", "hip"]: flags.extend( [ "--iree-hal-target-backends=rocm", @@ -199,7 +220,6 @@ def compile_to_vmfb( elif ireec_flags == None: ireec_flags = [] - debug = False if debug: flags.extend( ["--iree-hal-dump-executable-files-to=" + safe_name + "_dispatches"] @@ -229,11 +249,17 @@ def compile_to_vmfb( # This 'attn_spec' handles a linalg_ext.attention op lowering to mfma instructions for capable targets. # This is a temporary solution, and should be removed or largely disabled once the functionality of # the TD spec is implemented in C++. 
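+    # Spec resolution sketch (illustrative): "default"/"mfma"/"punet" fetch an MFMA
+    # attention spec ("punet" selects the int8 punet variant) and write it next to
+    # the artifacts as attention_and_matmul_spec_mfma[_punet|_pad].mlir, while
+    # gfx11* targets without an explicit spec fall back to a WMMA spec. The chosen
+    # path is passed to iree-compile as
+    # --iree-codegen-transform-dialect-library=<spec_path>.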
- if attn_spec in ["default", "mfma"]: + + if attn_spec in ["default", "mfma", "punet"]: + use_punet = True if attn_spec in ["punet", "i8"] else False attn_spec = get_mfma_spec_path( - target_triple, os.path.dirname(safe_name), masked_attention + target_triple, + os.path.dirname(safe_name), + masked_attention, + use_punet=use_punet, ) flags.extend(["--iree-codegen-transform-dialect-library=" + attn_spec]) + elif attn_spec in ["wmma"] or ("gfx11" in target_triple and not attn_spec): attn_spec = get_wmma_spec_path( target_triple, os.path.dirname(safe_name), masked_attention @@ -300,22 +326,25 @@ def compile_to_vmfb( return safe_vmfb_name + ".vmfb" -def create_safe_name(hf_model_name, model_name_str): +def create_safe_name(hf_model_name, model_name_str=""): safe_name = hf_model_name.split("/")[-1].strip() + model_name_str safe_name = re.sub("-", "_", safe_name) safe_name = re.sub("\.", "_", safe_name) return safe_name -def get_mfma_spec_path(target_chip, save_dir, masked_attention=False): - if not masked_attention: +def get_mfma_spec_path(target_chip, save_dir, masked_attention=False, use_punet=False): + if use_punet: + suffix = "_punet" + url = "https://raw.githubusercontent.com/nod-ai/sdxl-scripts/main/int8-model/specs/attention_and_matmul_spec.mlir" + elif not masked_attention: + suffix = "" url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/no_pad/attention_and_matmul_spec_mfma.mlir" else: + suffix = "_pad" url = "https://sharkpublic.blob.core.windows.net/sharkpublic/specs/latest/attention_and_matmul_spec_gfx942.mlir" attn_spec = urlopen(url).read().decode("utf-8") - spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir") - if os.path.exists(spec_path): - return spec_path + spec_path = os.path.join(save_dir, f"attention_and_matmul_spec_mfma{suffix}.mlir") with open(spec_path, "w") as f: f.write(attn_spec) return spec_path @@ -331,7 +360,8 @@ def get_wmma_spec_path(target_chip, save_dir, masked_attention=False): else: return None attn_spec = urlopen(url).read().decode("utf-8") - spec_path = os.path.join(save_dir, "attention_and_matmul_spec_wmma.mlir") + suffix = "masked" if masked_attention else "" + spec_path = os.path.join(save_dir, f"attention_and_matmul_spec_wmma{suffix}.mlir") with open(spec_path, "w") as f: f.write(attn_spec) return spec_path diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 475cf1d1d..d9c0fd743 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -13,6 +13,7 @@ from shark_turbine.dynamo.passes import ( DEFAULT_DECOMPOSITIONS, ) +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from turbine_models.custom_models.sd_inference import utils import torch import torch._dynamo as dynamo @@ -54,30 +55,58 @@ def __init__( ) self.vae.load_state_dict(custom_vae) - def decode_inp(self, inp): - inp = 1 / 0.18215 * inp + def decode(self, inp): + inp = 1 / self.vae.config.scaling_factor * inp x = self.vae.decode(inp, return_dict=False)[0] return (x / 2 + 0.5).clamp(0, 1) - def encode_inp(self, inp): + def encode(self, inp): latents = self.vae.encode(inp).latent_dist.sample() - return 0.18215 * latents + return self.vae.config.scaling_factor * latents + + +class SD3VaeModel(torch.nn.Module): + def __init__( + self, + hf_model_name, + ): + super().__init__() + self.vae = AutoencoderKL.from_pretrained( + hf_model_name, + subfolder="vae", + ) + + def decode(self, 
inp): + inp = (inp / self.vae.config.scaling_factor) + self.vae.config.shift_factor + image = self.vae.decode(inp, return_dict=False)[0] + image = image.float() + image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] + return image + + def encode(self, inp): + image_np = inp / 255.0 + image_np = np.moveaxis(image_np, 2, 0) + batch_images = np.expand_dims(image_np, axis=0).repeat(1, axis=0) + image_torch = torch.from_numpy(batch_images) + image_torch = 2.0 * image_torch - 1.0 + image_torch = image_torch + latent = self.vae.encode(image_torch) + return latent def export_vae_model( - vae_model, hf_model_name, batch_size, height, width, precision, compile_to="torch", + num_channels=4, external_weights=None, external_weight_path=None, device=None, - target_triple=None, + target=None, ireec_flags=None, - variant="decode", decomp_attn=False, exit_on_vmfb=False, pipeline_dir=None, @@ -86,18 +115,22 @@ def export_vae_model( weights_only=False, upload_ir=False, ): + dtype = torch.float16 if precision == "fp16" else torch.float32 + np_dtype = "float16" if precision == "fp16" else "float32" + safe_name = utils.create_safe_name( + hf_model_name, + f"_bs{batch_size}_{height}x{width}_{precision}_vae", + ) + if decomp_attn: + safe_name += "_decomp_attn" if pipeline_dir: - safe_name = os.path.join(pipeline_dir, "vae_" + variant) - else: - safe_name = utils.create_safe_name( - hf_model_name, - f"_bs{batch_size}_{height}x{width}_{precision}_vae_{variant}_{device}", - ) + safe_name = os.path.join(pipeline_dir, safe_name) + if input_mlir: vmfb_path = utils.compile_to_vmfb( input_mlir, device, - target_triple, + target, ireec_flags, safe_name, mlir_source="file", @@ -105,46 +138,94 @@ def export_vae_model( attn_spec=attn_spec, ) return vmfb_path + + if "stable-diffusion-3" in hf_model_name: + vae_model = SD3VaeModel(hf_model_name) + else: + if "xl" in hf_model_name and precision == "fp16": + custom_vae = "madebyollin/sdxl-vae-fp16-fix" + else: + custom_vae = None + vae_model = VaeModel(hf_model_name, custom_vae=custom_vae) + + if dtype == torch.float16: + vae_model = vae_model.half() mapper = {} - decomp_list = DEFAULT_DECOMPOSITIONS - if decomp_attn: - decomp_list.extend( - [ - torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, - torch.ops.aten._scaled_dot_product_flash_attention.default, - ] - ) - dtype = torch.float16 if precision == "fp16" else torch.float32 - vae_model = vae_model.to(dtype) utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path ) if weights_only: return external_weight_path - sample = (batch_size, 4, height // 8, width // 8) - if variant == "encode": - sample = (batch_size, 3, height, width) - class CompiledVae(CompiledModule): - params = export_parameters(vae_model) + input_image_shape = (height, width, 3) + input_latents_shape = (batch_size, num_channels, height // 8, width // 8) + encode_args = [ + torch.empty( + input_image_shape, + dtype=torch.float32, + ) + ] + decode_args = [ + torch.empty( + input_latents_shape, + dtype=dtype, + ) + ] + decomp_list = [] + if decomp_attn == True: + decomp_list = [ + torch.ops.aten._scaled_dot_product_flash_attention_for_cpu, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten.scaled_dot_product_attention, + ] + with decompositions.extend_aot_decompositions( + from_current=True, + add_ops=decomp_list, + ): + fxb = FxProgramsBuilder(vae_model) + + # TODO: fix issues with exporting the encode function. 
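+        # Only the decode entry point is exported for now; model_metadata_encode
+        # below is defined but not attached to the module until _encode is
+        # re-enabled.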
+ # @fxb.export_program(args=(encode_args,)) + # def _encode(module, inputs,): + # return module.encode(*inputs) + + @fxb.export_program(args=(decode_args,)) + def _decode(module, inputs): + return module.decode(*inputs) + + class CompiledVae(CompiledModule): + decode = _decode + + if external_weights: + externalize_module_parameters(vae_model) + + inst = CompiledVae(context=Context(), import_to="IMPORT") - def main(self, inp=AbstractTensor(*sample, dtype=dtype)): - if variant == "decode": - return jittable(vae_model.decode_inp, decompose_ops=decomp_list)(inp) - elif variant == "encode": - return jittable(vae_model.encode_inp, decompose_ops=decomp_list)(inp) + module = CompiledModule.get_mlir_module(inst) - import_to = "INPUT" if compile_to == "linalg" else "IMPORT" - inst = CompiledVae(context=Context(), import_to=import_to) + model_metadata_decode = { + "model_name": "vae_decode", + "input_shapes": [input_latents_shape], + "input_dtypes": [np_dtype], + "output_shapes": [(3, width, height) * batch_size], + "output_dtypes": ["float32"], + } + model_metadata_encode = { + "model_name": "vae_encode", + "input_shapes": [input_image_shape], + "input_dtypes": [np_dtype], + "output_shapes": [input_latents_shape], + "output_dtypes": [np_dtype], + } + module = AddMetadataPass(module, model_metadata_decode, "decode").run() - module_str = str(CompiledModule.get_mlir_module(inst)) if compile_to != "vmfb": - return module_str + return str(module) else: vmfb_path = utils.compile_to_vmfb( - module_str, + str(module), device, - target_triple, + target, ireec_flags, safe_name, return_path=not exit_on_vmfb, @@ -161,7 +242,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): else: vae_model = VaeModel( args.hf_model_name, - custom_vae=custom_vae, + custom_vae=None, ) mod_str = export_vae_model( vae_model, @@ -174,7 +255,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=dtype)): external_weights=args.external_weights, external_weight_path=args.external_weight_path, device=args.device, - target_triple=args.iree_target_triple, + target=args.iree_target_triple, ireec_flags=args.ireec_flags + args.attn_flags + args.vae_flags, variant=args.vae_variant, decomp_attn=args.decomp_attn, diff --git a/models/turbine_models/custom_models/sd_inference/vae_runner.py b/models/turbine_models/custom_models/sd_inference/vae_runner.py index cded33824..166021631 100644 --- a/models/turbine_models/custom_models/sd_inference/vae_runner.py +++ b/models/turbine_models/custom_models/sd_inference/vae_runner.py @@ -5,17 +5,19 @@ import torch -def run_vae(device, example_input, vmfb_path, hf_model_name, external_weight_path): +def run_vae_decode( + device, example_input, vmfb_path, hf_model_name, external_weight_path +): runner = vmfbRunner(device, vmfb_path, external_weight_path) inputs = [ireert.asdevicearray(runner.config.device, example_input)] - results = runner.ctx.modules.compiled_vae["main"](*inputs).to_host() + results = runner.ctx.modules.compiled_vae["decode"](*inputs).to_host() return results -def run_torch_vae(hf_model_name, variant, example_input): +def run_torch_vae_decode(hf_model_name, variant, example_input): from diffusers import AutoencoderKL class VaeModel(torch.nn.Module): @@ -87,7 +89,7 @@ def encode_inp(self, inp): args.batch_size, 3, args.height, args.width, dtype=torch.float32 ) print("generating turbine output:") - turbine_results = run_vae( + turbine_results = run_vae_decode( args.device, example_input, args.vmfb_path, @@ -104,7 +106,7 @@ def encode_inp(self, inp): print("generating torch output: 
") from turbine_models.custom_models.sd_inference import utils - torch_output = run_torch_vae( + torch_output = run_torch_vae_decode( args.hf_model_name, args.hf_auth_token, args.variant, example_input ) print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 224e63233..00b02d028 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -12,6 +12,8 @@ from iree.compiler.ir import Context import numpy as np from shark_turbine.aot import * +from shark_turbine.transforms.general.add_metadata import AddMetadataPass + from turbine_models.custom_models.sd_inference import utils import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer @@ -155,29 +157,27 @@ def export_prompt_encoder( hf_model_name, hf_auth_token=None, max_length=64, + batch_size=1, precision="fp16", compile_to="torch", external_weights=None, external_weight_path=None, device=None, - target_triple=None, + target=None, ireec_flags=None, - exit_on_vmfb=True, + exit_on_vmfb=False, pipeline_dir=None, input_mlir=None, attn_spec=None, weights_only=False, - batchsize=1, batch_input=False, + decomp_attn=False, # Compatibility ): - if "turbo" in hf_model_name: - do_classifier_free_guidance = False - else: - do_classifier_free_guidance = True + do_classifier_free_guidance = True safe_name = utils.create_safe_name( hf_model_name, - f"_bs{batchsize}_{str(max_length)}-{precision}-prompt-encoder-{device}", + f"_bs{batch_size}_{str(max_length)}-{precision}-prompt-encoder-{device}", ) if pipeline_dir not in [None, ""]: safe_name = os.path.join(pipeline_dir, safe_name) @@ -186,9 +186,9 @@ def export_prompt_encoder( vmfb_path = utils.compile_to_vmfb( input_mlir, device, - target_triple, + target, ireec_flags, - safe_name + "_" + target_triple, + safe_name, mlir_source="file", return_path=not exit_on_vmfb, const_expr_hoisting=True, @@ -214,7 +214,7 @@ def export_prompt_encoder( precision, hf_auth_token, do_classifier_free_guidance, - batch_size=batchsize, + batch_size=batch_size, batch_input=batch_input, ) @@ -265,22 +265,31 @@ def encode_prompts_turbo( import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledClip(context=Context(), import_to=import_to) - module_str = str(CompiledModule.get_mlir_module(inst)) + module = CompiledModule.get_mlir_module(inst) + + model_metadata_encode = { + "model_name": hf_model_name + "_text_encoder", + "input_shapes": [str((1, max_length)) for i in range(4)], + "input_dtypes": ["int64" for i in range(4)], + "use_attention_mask": False, + } + module = AddMetadataPass(module, model_metadata_encode, "encode_prompts").run() + module_str = str(module) if compile_to != "vmfb": - return module_str, tokenizers + return module_str else: vmfb_path = utils.compile_to_vmfb( module_str, device, - target_triple, + target, ireec_flags, - safe_name + "_" + target_triple, + safe_name, return_path=not exit_on_vmfb, const_expr_hoisting=True, attn_spec=attn_spec, ) - return module_str, vmfb_path + return vmfb_path if __name__ == "__main__": @@ -290,6 +299,7 @@ def encode_prompts_turbo( args.hf_model_name, args.hf_auth_token, args.max_length, + args.batch_size, args.precision, args.compile_to, args.external_weights, @@ -301,7 +311,6 @@ def encode_prompts_turbo( 
pipeline_dir=args.pipeline_dir, input_mlir=args.input_mlir, attn_spec=args.attn_spec, - batchsize=args.batch_size, ) if args.input_mlir: exit() diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 6b45ab799..4d3af598c 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -7,22 +7,24 @@ import copy import os import sys +import safetensors from iree import runtime as ireert from iree.compiler.ir import Context import numpy as np from shark_turbine.aot import * -from shark_turbine.dynamo.passes import ( - DEFAULT_DECOMPOSITIONS, -) +from shark_turbine.transforms.general.add_metadata import AddMetadataPass + + from turbine_models.custom_models.sd_inference import utils import torch -import torch._dynamo as dynamo -from diffusers import UNet2DConditionModel +from huggingface_hub import hf_hub_download class UnetModel(torch.nn.Module): def __init__(self, hf_model_name, hf_auth_token=None, precision="fp32"): + from diffusers import UNet2DConditionModel + super().__init__() if precision == "fp16": try: @@ -47,18 +49,23 @@ def __init__(self, hf_model_name, hf_auth_token=None, precision="fp32"): auth_token=hf_auth_token, low_cpu_mem_usage=False, ) - # if "turbo" in hf_model_name: - # self.do_classifier_free_guidance = False - # else: self.do_classifier_free_guidance = True def forward( - self, latent_model_input, timestep, prompt_embeds, text_embeds, time_ids + self, + latent_model_input, + timestep, + prompt_embeds, + text_embeds, + time_ids, + guidance_scale, ): added_cond_kwargs = { "text_embeds": text_embeds, "time_ids": time_ids, } + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([latent_model_input] * 2) noise_pred = self.unet.forward( latent_model_input, timestep, @@ -67,12 +74,92 @@ def forward( added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] + if self.do_classifier_free_guidance: + noise_preds = noise_pred.chunk(2) + noise_pred = noise_preds[0] + guidance_scale * ( + noise_preds[1] - noise_preds[0] + ) return noise_pred +def get_punet_model(hf_model_name, external_weight_path, precision="i8"): + from sharktank.models.punet.model import ( + Unet2DConditionModel as sharktank_unet2d, + ClassifierFreeGuidanceUnetModel as sharktank_CFGPunetModel, + ) + from sharktank.utils import cli + + if precision == "i8": + repo_id = "amd-shark/sdxl-quant-models" + subfolder = "unet/int8" + revision = "82e06d6ea22ac78102a9aded69e8ddfb9fa4ae37" + elif precision in ["fp16", "fp32"]: + repo_id = hf_model_name + subfolder = "unet" + revision = "76d28af79639c28a79fa5c6c6468febd3490a37e" + + def download(filename): + return hf_hub_download( + repo_id=repo_id, subfolder=subfolder, filename=filename, revision=revision + ) + + results = { + "config.json": download("config.json"), + "params.safetensors": download("params.safetensors"), + } + output_dir = os.path.dirname(external_weight_path) + + if precision == "i8": + results["quant_params.json"] = download("quant_params.json") + ds_filename = ( + os.path.basename(external_weight_path).split("unet")[0] + + "punet_dataset_i8.irpa" + ) + output_path = os.path.join(output_dir, ds_filename) + ds = get_punet_dataset( + results["config.json"], + results["params.safetensors"], + output_path, + results["quant_params.json"], + ) + else: + ds_filename = ( + os.path.basename(external_weight_path).split("unet")[0] + + f"punet_dataset_{precision}.irpa" + ) + output_path = 
os.path.join(output_dir, ds_filename) + ds = get_punet_dataset( + results["config.json"], + results["params.safetensors"], + output_path, + ) + + cond_unet = sharktank_unet2d.from_dataset(ds) + mdl = sharktank_CFGPunetModel(cond_unet) + return mdl + + +def get_punet_dataset( + config_json_path, + params_path, + output_path, + quant_params_path=None, +): + from sharktank.models.punet.tools import import_brevitas_dataset + + ds_import_args = [ + f"--config-json={config_json_path}", + f"--params={params_path}", + f"--output-irpa-file={output_path}", + ] + if quant_params_path: + ds_import_args.extend([f"--quant-params={quant_params_path}"]) + import_brevitas_dataset.main(ds_import_args) + return import_brevitas_dataset.Dataset.load(output_path) + + @torch.no_grad() def export_unet_model( - unet_model, hf_model_name, batch_size, height, @@ -84,7 +171,7 @@ def export_unet_model( external_weights=None, external_weight_path=None, device=None, - target_triple=None, + target=None, ireec_flags=None, decomp_attn=False, exit_on_vmfb=False, @@ -92,10 +179,21 @@ def export_unet_model( attn_spec=None, input_mlir=None, weights_only=False, + use_punet=False, ): + if use_punet: + submodel_name = "punet" + else: + submodel_name = "unet" + if (not decomp_attn) and use_punet: + attn_spec = "punet" + elif (not decomp_attn) and "gfx9" in target: + attn_spec = "mfma" + elif (not decomp_attn) and "gfx11" in target: + attn_spec = "wmma" safe_name = utils.create_safe_name( hf_model_name, - f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_unet", + f"_bs{batch_size}_{max_length}_{height}x{width}_{precision}_{submodel_name}", ) if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) @@ -107,24 +205,43 @@ def export_unet_model( vmfb_path = utils.compile_to_vmfb( input_mlir, device, - target_triple, + target, ireec_flags, - safe_name + "_" + target_triple, + safe_name, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, ) return vmfb_path + elif use_punet: + unet_model = get_punet_model(hf_model_name, external_weight_path, precision) + else: + unet_model = UnetModel(hf_model_name, hf_auth_token, precision) mapper = {} - dtype = torch.float16 if precision == "fp16" else torch.float32 + np_dtypes = { + "fp16": "float16", + "fp32": "float32", + "i8": "int8", + } + torch_dtypes = { + "fp16": torch.float16, + "fp32": torch.float32, + "i8": torch.int8, + } + dtype = torch_dtypes[precision] + np_dtype = np_dtypes[precision] - if precision == "fp16": + if precision == "fp16" and not use_punet: unet_model = unet_model.half() - utils.save_external_weights( - mapper, unet_model, external_weights, external_weight_path - ) + if use_punet: + dtype = torch.float16 + + if not use_punet: + utils.save_external_weights( + mapper, unet_model, external_weights, external_weight_path + ) if weights_only: return external_weight_path @@ -132,24 +249,32 @@ def export_unet_model( do_classifier_free_guidance = True init_batch_dim = 2 if do_classifier_free_guidance else 1 - prepared_latents = ( - batch_size * init_batch_dim, - unet_model.unet.config.in_channels, + sample = [ + batch_size, + 4, height // 8, width // 8, - ) + ] time_ids_shape = (init_batch_dim * batch_size, 6) prompt_embeds_shape = (init_batch_dim * batch_size, max_length, 2048) text_embeds_shape = (init_batch_dim * batch_size, 1280) example_forward_args = [ - torch.empty(prepared_latents, dtype=dtype), + torch.empty(sample, dtype=dtype), torch.empty(1, dtype=dtype), torch.empty(prompt_embeds_shape, dtype=dtype), torch.empty(text_embeds_shape, 
dtype=dtype), torch.empty(time_ids_shape, dtype=dtype), + torch.tensor([7.5], dtype=dtype), ] - + example_forward_args_dict = { + "sample": torch.rand(sample, dtype=dtype), + "timestep": torch.zeros(1, dtype=dtype), + "encoder_hidden_states": torch.rand(prompt_embeds_shape, dtype=dtype), + "text_embeds": torch.rand(text_embeds_shape, dtype=dtype), + "time_ids": torch.zeros(time_ids_shape, dtype=dtype), + "guidance_scale": torch.tensor([7.5], dtype=dtype), + } decomp_list = [] if decomp_attn == True: decomp_list = [ @@ -161,36 +286,61 @@ def export_unet_model( from_current=True, add_ops=decomp_list, ): - fxb = FxProgramsBuilder(unet_model) + if use_punet: + output = export( + unet_model, + kwargs=example_forward_args_dict, + module_name="compiled_punet", + ) + module = output.mlir_module + else: + if external_weights: + externalize_module_parameters(unet_model) + fxb = FxProgramsBuilder(unet_model) - @fxb.export_program( - args=(example_forward_args,), - ) - def _forward( - module, - inputs, - ): - return module.forward(*inputs) + @fxb.export_program( + args=(example_forward_args,), + ) + def _forward( + module, + inputs, + ): + return module.forward(*inputs) + + class CompiledUnet(CompiledModule): + run_forward = _forward - class CompiledUnet(CompiledModule): - run_forward = _forward + inst = CompiledUnet(context=Context(), import_to="IMPORT") - if external_weights: - externalize_module_parameters(unet_model) + module = CompiledModule.get_mlir_module(inst) - inst = CompiledUnet(context=Context(), import_to="IMPORT") + model_metadata_run_forward = { + "model_name": "sd_unet", + "input_shapes": [ + sample, + (1,), + prompt_embeds_shape, + text_embeds_shape, + time_ids_shape, + (1,), + ], + "input_dtypes": [np_dtype for x in range(6)], + "output_shapes": [sample], + "output_dtypes": [np_dtype], + } - module_str = str(CompiledModule.get_mlir_module(inst)) + module = AddMetadataPass(module, model_metadata_run_forward, "run_forward").run() + module_str = str(module) if compile_to != "vmfb": return module_str else: vmfb_path = utils.compile_to_vmfb( module_str, device, - target_triple, + target, ireec_flags, - safe_name + "_" + target_triple, + safe_name, return_path=True, attn_spec=attn_spec, ) @@ -236,7 +386,7 @@ class CompiledUnet(CompiledModule): exit() safe_name = utils.create_safe_name( args.hf_model_name, - f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_unet", + f"_bs{args.batch_size}_{args.max_length}_{args.height}x{args.width}_{args.precision}_{'p' if args.use_i8_punet else ''}unet", ) if args.compile_to != "vmfb": with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py index 9d0b405c3..c474982d7 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py @@ -31,6 +31,7 @@ def run_unet( ireert.asdevicearray(runner.config.device, prompt_embeds), ireert.asdevicearray(runner.config.device, text_embeds), ireert.asdevicearray(runner.config.device, time_ids), + ireert.asdevicearray(runner.config.device, guidance_scale), ] results = runner.ctx.modules.compiled_unet["run_forward"](*inputs) @@ -56,6 +57,7 @@ def run_unet_steps( ireert.asdevicearray(runner.config.device, prompt_embeds), ireert.asdevicearray(runner.config.device, text_embeds), ireert.asdevicearray(runner.config.device, time_ids), + ireert.asdevicearray(runner.config.device, 
guidance_scale), ] for i, t in tqdm(enumerate(scheduler.timesteps)): timestep = t @@ -116,7 +118,7 @@ def run_torch_unet( sample = torch.rand( args.batch_size, 4, args.height // 8, args.width // 8, dtype=dtype ) - timestep = torch.ones(1, dtype=torch.int64) + timestep = torch.ones(1, dtype=dtype) prompt_embeds = torch.rand(2 * args.batch_size, args.max_length, 2048, dtype=dtype) text_embeds = torch.rand(2 * args.batch_size, 1280, dtype=dtype) time_ids = torch.rand(2 * args.batch_size, 6, dtype=dtype) diff --git a/models/turbine_models/custom_models/sdxl_inference/vae_runner.py b/models/turbine_models/custom_models/sdxl_inference/vae_runner.py index 539c99868..01767d322 100644 --- a/models/turbine_models/custom_models/sdxl_inference/vae_runner.py +++ b/models/turbine_models/custom_models/sdxl_inference/vae_runner.py @@ -15,64 +15,22 @@ def run_vae( ): runner = vmfbRunner(device, vmfb_path, external_weight_path) inputs = [ireert.asdevicearray(runner.config.device, example_input)] - results = runner.ctx.modules.compiled_vae["main"](*inputs) + results = runner.ctx.modules.compiled_vae["decode"](*inputs) return results def run_torch_vae(hf_model_name, custom_vae, variant, example_input): - from diffusers import AutoencoderKL - - class VaeModel(torch.nn.Module): - def __init__( - self, - hf_model_name, - custom_vae=custom_vae, - ): - super().__init__() - self.vae = None - if custom_vae in ["", None]: - self.vae = AutoencoderKL.from_pretrained( - hf_model_name, - subfolder="vae", - ) - elif not isinstance(custom_vae, dict): - try: - # custom HF repo with no vae subfolder - self.vae = AutoencoderKL.from_pretrained( - custom_vae, - ) - except: - # some larger repo with vae subfolder - self.vae = AutoencoderKL.from_pretrained( - custom_vae, - subfolder="vae", - ) - else: - # custom vae as a HF state dict - self.vae = AutoencoderKL.from_pretrained( - hf_model_name, - subfolder="vae", - ) - self.vae.load_state_dict(custom_vae) - - def decode_inp(self, inp): - inp = inp / 0.13025 - x = self.vae.decode(inp, return_dict=False)[0] - return (x / 2 + 0.5).clamp(0, 1) - - def encode_inp(self, inp): - latents = self.vae.encode(inp).latent_dist.sample() - return 0.13025 * latents + from turbine_models.custom_models.sd_inference.vae import VaeModel vae_model = VaeModel( hf_model_name, ) if variant == "decode": - results = vae_model.decode_inp(example_input) + results = vae_model.decode(example_input) elif variant == "encode": - results = vae_model.encode_inp(example_input) + results = vae_model.encode(example_input) np_torch_output = results.detach().cpu().numpy() return np_torch_output diff --git a/models/turbine_models/tests/conftest.py b/models/turbine_models/tests/conftest.py index 1c1952605..d93aa2e60 100644 --- a/models/turbine_models/tests/conftest.py +++ b/models/turbine_models/tests/conftest.py @@ -18,7 +18,7 @@ def pytest_addoption(parser): action="store", default="blurry, unsaturated, watermark, noisy, grainy, out of focus", ) - parser.addoption("--num_inference_steps", type=int, action="store", default=5) + parser.addoption("--num_inference_steps", type=int, action="store", default=2) parser.addoption("--guidance_scale", type=float, action="store", default=7.5) parser.addoption("--seed", type=float, action="store", default=0.0) parser.addoption("--vmfb_path", action="store", default="") @@ -50,4 +50,4 @@ def pytest_addoption(parser): parser.addoption("--in_channels", type=int, action="store", default=4) parser.addoption("--benchmark", action="store_true", default=False) 
parser.addoption("--tracy_profile", action="store_true", default=False) - parser.addoption("--compiled_pipeline", type=bool, default=True) + parser.addoption("--compiled_pipeline", type=bool, default=False) diff --git a/models/turbine_models/tests/pipeline_test.py b/models/turbine_models/tests/pipeline_test.py index 76e33c96a..658402652 100644 --- a/models/turbine_models/tests/pipeline_test.py +++ b/models/turbine_models/tests/pipeline_test.py @@ -85,9 +85,9 @@ class CompiledTester(CompiledModule): class TestPipeline(TurbinePipelineBase): def __init__( self, - **kwargs, + **base_args, ): - super().__init__(**kwargs) + super().__init__(**base_args) def run(self, inputs: list): return self.test_model_1("forward", *inputs) @@ -103,14 +103,12 @@ def setUp(self): "safe_name": "TestModel2xLinear", "keywords": ["Test", "Model", "2x", "Linear"], "export_fn": export_dummy_model, - "export_args": None, } } self.pipe = TestPipeline( model_map=model_map, - batch_size=1, device="cpu", - iree_target_triple="x86_64-unknown-linux-gnu", + target="x86_64-unknown-linux-gnu", pipeline_dir="./", precision="fp32", ) diff --git a/models/turbine_models/tests/sd3_test.py b/models/turbine_models/tests/sd3_test.py index 95309947d..a627eb287 100644 --- a/models/turbine_models/tests/sd3_test.py +++ b/models/turbine_models/tests/sd3_test.py @@ -354,191 +354,54 @@ def test03_ExportVaeModelDecode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) + @pytest.mark.skip("Waiting on inference plumbing for generalized sd pipeline") + def test04SDPipeline(self): + from turbine_models.custom_models.sd_inference.sd_pipeline import ( + SharkSDPipeline, + ) -# def test04_ExportVaeModelEncode(self): -# if arguments["device"] in ["cpu", "vulkan", "cuda", "rocm"]: -# self.skipTest( -# "Compilation error on cpu, vulkan and rocm; To be tested on cuda." -# ) -# vae.export_vae_model( -# vae_model=self.vae_model, -# # This is a public model, so no auth required -# hf_model_name=arguments["hf_model_name"], -# batch_size=arguments["batch_size"], -# height=arguments["height"], -# width=arguments["width"], -# precision=arguments["precision"], -# compile_to="vmfb", -# external_weights=arguments["external_weights"], -# external_weight_path=self.safe_model_name -# + "_" -# + arguments["precision"] -# + "_vae_encode." -# + arguments["external_weights"], -# device=arguments["device"], -# target_triple=arguments["iree_target_triple"], -# ireec_flags=arguments["ireec_flags"], -# variant="encode", -# decomp_attn=arguments["decomp_attn"], -# exit_on_vmfb=True, -# ) -# arguments["external_weight_path"] = ( -# self.safe_model_name -# + "_" -# + arguments["precision"] -# + "_vae_encode." 
-# + arguments["external_weights"] -# ) -# arguments["vmfb_path"] = ( -# self.safe_model_name -# + "_" -# + str(arguments["height"]) -# + "x" -# + str(arguments["width"]) -# + "_" -# + arguments["precision"] -# + "_vae_encode_" -# + arguments["device"] -# + ".vmfb" -# ) -# example_input = torch.ones( -# arguments["batch_size"], -# 3, -# arguments["height"], -# arguments["width"], -# dtype=torch.float32, -# ) -# example_input_torch = example_input -# if arguments["precision"] == "fp16": -# example_input = example_input.half() -# turbine = vae_runner.run_vae( -# arguments["rt_device"], -# example_input, -# arguments["vmfb_path"], -# arguments["hf_model_name"], -# arguments["external_weight_path"], -# ) -# torch_output = vae_runner.run_torch_vae( -# arguments["hf_model_name"], -# ( -# "madebyollin/sdxl-vae-fp16-fix" -# if arguments["precision"] == "fp16" -# else "" -# ), -# "encode", -# example_input_torch, -# ) -# if arguments["benchmark"] or arguments["tracy_profile"]: -# run_benchmark( -# "vae_encode", -# arguments["vmfb_path"], -# arguments["external_weight_path"], -# arguments["rt_device"], -# height=arguments["height"], -# width=arguments["width"], -# precision=arguments["precision"], -# tracy_profile=arguments["tracy_profile"], -# ) -# rtol = 4e-2 -# atol = 4e-2 -# np.testing.assert_allclose(torch_output, turbine, rtol, atol) + current_args = copy.deepcopy(default_arguments) + decomp_attn = { + "text_encoder": False, + "unet": False, + "vae": current_args["vae_decomp_attn"], + } + sd_pipe = SharkSDPipeline( + current_args["hf_model_name"], + current_args["height"], + current_args["width"], + current_args["batch_size"], + current_args["max_length"], + current_args["precision"], + current_args["device"], + current_args["iree_target_triple"], + ireec_flags=None, # ireec_flags + attn_spec=current_args["attn_spec"], + decomp_attn=decomp_attn, + pipeline_dir="test_vmfbs", # pipeline_dir + external_weights_dir="test_weights", # external_weights_dir + external_weights=current_args["external_weights"], + num_inference_steps=current_args["num_inference_steps"], + cpu_scheduling=True, + scheduler_id=current_args["scheduler_id"], + shift=None, # shift + use_i8_punet=False, + ) + sd_pipe.prepare_all() + sd_pipe.load_map() + output = sd_pipe.generate_images( + current_args["prompt"], + current_args["negative_prompt"], + current_args["num_inference_steps"], + 1, # batch count + current_args["guidance_scale"], + current_args["seed"], + current_args["cpu_scheduling"], + current_args["scheduler_id"], + True, # return_img + ) + assert output is not None -# def test05_t2i_generate_images(self): -# if arguments["device"] in ["vulkan", "cuda", "rocm"]: -# self.skipTest( -# "Have issues with submodels on vulkan, cuda; ROCM hangs on mi250 despite submodels working." 
-# ) -# mlirs = { -# "vae_decode": None, -# "prompt_encoder": None, -# "scheduled_unet": None, -# "pipeline": None, -# "full_pipeline": None, -# } -# vmfbs = { -# "vae_decode": None, -# "prompt_encoder": None, -# "scheduled_unet": None, -# "pipeline": None, -# "full_pipeline": None, -# } -# weights = { -# "vae_decode": None, -# "prompt_encoder": None, -# "scheduled_unet": None, -# "pipeline": None, -# "full_pipeline": None, -# } -# -# if not arguments["pipeline_dir"]: -# pipe_id_list = [ -# "sdxl_1_0", -# str(arguments["height"]), -# str(arguments["width"]), -# str(arguments["max_length"]), -# arguments["precision"], -# arguments["device"], -# ] -# arguments["pipeline_dir"] = os.path.join( -# ".", -# "_".join(pipe_id_list), -# ) -# ireec_flags = { -# "unet": arguments["ireec_flags"], -# "vae": arguments["ireec_flags"], -# "clip": arguments["ireec_flags"], -# "pipeline": arguments["ireec_flags"], -# } -# user_mlir_list = [] -# for submodel_id, mlir_path in zip(mlirs.keys(), user_mlir_list): -# if submodel_id in mlir_path: -# mlirs[submodel_id] = mlir_path -# external_weights_dir = arguments["pipeline_dir"] -# sdxl_pipe = sdxl_compiled_pipeline.SharkSDXLPipeline( -# arguments["hf_model_name"], -# arguments["scheduler_id"], -# arguments["height"], -# arguments["width"], -# arguments["precision"], -# arguments["max_length"], -# arguments["batch_size"], -# arguments["num_inference_steps"], -# arguments["device"], -# arguments["iree_target_triple"], -# ireec_flags, -# arguments["attn_spec"], -# arguments["decomp_attn"], -# arguments["pipeline_dir"], -# external_weights_dir, -# arguments["external_weights"], -# ) -# vmfbs, weights = sdxl_pipe.check_prepared( -# mlirs, vmfbs, weights, interactive=False -# ) -# sdxl_pipe.load_pipeline( -# vmfbs, weights, arguments["rt_device"], arguments["compiled_pipeline"] -# ) -# sdxl_pipe.generate_images( -# arguments["prompt"], -# arguments["negative_prompt"], -# 1, -# arguments["guidance_scale"], -# arguments["seed"], -# ) -# print("Image generation complete.") -# os.remove(os.path.join(arguments["pipeline_dir"], "prompt_encoder.vmfb")) -# os.remove( -# os.path.join( -# arguments["pipeline_dir"], -# arguments["scheduler_id"] -# + "_unet_" -# + str(arguments["num_inference_steps"]) -# + ".vmfb", -# ) -# ) -# os.remove(os.path.join(arguments["pipeline_dir"], "vae_decode.vmfb")) -# os.remove(os.path.join(arguments["pipeline_dir"], "full_pipeline.vmfb")) -# if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 7af7dcb10..738738702 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -23,6 +23,7 @@ import os import copy import platform +from PIL import Image from turbine_models.turbine_tank import turbine_tank @@ -30,8 +31,8 @@ "hf_auth_token": None, "hf_model_name": "CompVis/stable-diffusion-v1-4", "safe_model_name": "stable-diffusion_v1_4", - "scheduler_id": "PNDM", - "num_inference_steps": 5, + "scheduler_id": "EulerDiscrete", + "num_inference_steps": 2, "batch_size": 1, "height": 512, "width": 512, @@ -47,91 +48,39 @@ "rt_device": "local-task", "iree_target_triple": "x86_64-linux-gnu", "prompt": "a photograph of an astronaut riding a horse", + "negative_prompt": "blurry, out of focus", "in_channels": 4, + "vae_decomp_attn": True, + "seed": 0, + "use_i8_punet": False, + "attn_spec": None, + "cpu_scheduling": True, } UPLOAD_IR = os.environ.get("TURBINE_TANK_ACTION", "not_upload") == "upload" -unet_model = 
unet.UnetModel( - # This is a public model, so no auth required - default_arguments["hf_model_name"], -) - -vae_model = vae.VaeModel( - # This is a public model, so no auth required - default_arguments["hf_model_name"], - custom_vae=None, -) - -scheduler = schedulers.get_scheduler( - default_arguments["hf_model_name"], default_arguments["scheduler_id"] -) -scheduler_module = schedulers.SchedulingModel( - scheduler, - default_arguments["height"], - default_arguments["width"], - default_arguments["num_inference_steps"], - default_arguments["precision"], -) - - # TODO: this is a mess, don't share args across tests, create a copy for each test class StableDiffusionTest(unittest.TestCase): - def testExportT5Model(self): + def testExportClipModel(self): current_args = copy.deepcopy(default_arguments) - current_args["hf_model_name"] = "google/t5-v1_1-small" - blob_name = clip.export_clip_model( - hf_model_name=current_args["hf_model_name"], - max_length=64, - precision=current_args["precision"], - compile_to="vmfb", - external_weights=None, - external_weight_path=None, - device="cpu", - target_triple=None, - exit_on_vmfb=False, - upload_ir=UPLOAD_IR, - ) - current_args["vmfb_path"] = blob_name - turbine = clip_runner.run_clip( - current_args["rt_device"], - current_args["prompt"], - current_args["vmfb_path"], - current_args["hf_model_name"], - current_args["hf_auth_token"], - None, - ) - torch_output = clip_runner.run_torch_clip( - current_args["hf_model_name"], - current_args["hf_auth_token"], - current_args["prompt"], + current_args["hf_model_name"] = "CompVis/stable-diffusion-v1-4" + safe_prefix = utils.create_safe_name( + current_args["hf_model_name"].split("/")[-1], "clip" ) - err = utils.largest_error(torch_output, turbine[0]) - assert err < 9e-4 - if UPLOAD_IR: - new_blob_name = blob_name.split(".") - new_blob_name = new_blob_name[0] + "-pass.mlir" - turbine_tank.changeBlobName(blob_name, new_blob_name) - del current_args - - def testExportClipVitLarge14(self): - current_args = copy.deepcopy(default_arguments) - current_args["hf_model_name"] = "openai/clip-vit-large-patch14" - safe_prefix = "clip_vit_large_patch14" blob_name = clip.export_clip_model( hf_model_name=current_args["hf_model_name"], - max_length=64, + max_length=current_args["max_length"], precision=current_args["precision"], compile_to="vmfb", external_weights="safetensors", external_weight_path=safe_prefix + ".safetensors", device="cpu", - target_triple=None, + target=current_args["iree_target_triple"], exit_on_vmfb=False, upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = safe_prefix + ".safetensors" - current_args["vmfb_path"] = safe_prefix + "_clip.vmfb" + current_args["vmfb_path"] = blob_name turbine = clip_runner.run_clip( current_args["rt_device"], current_args["prompt"], @@ -155,67 +104,26 @@ def testExportClipVitLarge14(self): os.remove(current_args["external_weight_path"]) os.remove(current_args["vmfb_path"]) - def testExportClipModel(self): + def testExportUnetModel(self): current_args = copy.deepcopy(default_arguments) - current_args["hf_model_name"] = "CompVis/stable-diffusion-v1-4" - blob_name = clip.export_clip_model( + blob_name = unet.export_unet_model( hf_model_name=current_args["hf_model_name"], - max_length=64, + batch_size=current_args["batch_size"], + height=current_args["height"], + width=current_args["width"], precision=current_args["precision"], + max_length=current_args["max_length"], compile_to="vmfb", external_weights="safetensors", - external_weight_path=safe_prefix + ".safetensors", + 
external_weight_path="stable_diffusion_unet.safetensors", device="cpu", - target_triple=None, - exit_on_vmfb=False, - upload_ir=UPLOAD_IR, - ) - current_args["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" - current_args["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb" - turbine = clip_runner.run_clip( - current_args["rt_device"], - current_args["prompt"], - current_args["vmfb_path"], - current_args["hf_model_name"], - current_args["hf_auth_token"], - current_args["external_weight_path"], - ) - torch_output = clip_runner.run_torch_clip( - current_args["hf_model_name"], - current_args["hf_auth_token"], - current_args["prompt"], - ) - err = utils.largest_error(torch_output, turbine[0]) - assert err < 9e-5 - if UPLOAD_IR: - new_blob_name = blob_name.split(".") - new_blob_name = new_blob_name[0] + "-pass.mlir" - turbine_tank.changeBlobName(blob_name, new_blob_name) - if platform.system() != "Windows": - os.remove(current_args["external_weight_path"]) - os.remove(current_args["vmfb_path"]) - - def testExportUnetModel(self): - current_args = copy.deepcopy(default_arguments) - blob_name = unet.export_unet_model( - unet_model, - current_args["hf_model_name"], - current_args["batch_size"], - current_args["height"], - current_args["width"], - current_args["precision"], - current_args["max_length"], - None, - "vmfb", - "safetensors", - "stable_diffusion_unet.safetensors", - "cpu", + target=current_args["iree_target_triple"], upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = "stable_diffusion_unet.safetensors" current_args["vmfb_path"] = blob_name sample = torch.rand( - current_args["batch_size"], + current_args["batch_size"] * 2, current_args["in_channels"], current_args["height"] // 8, current_args["width"] // 8, @@ -245,6 +153,7 @@ def testExportUnetModel(self): current_args["hf_model_name"], current_args["hf_auth_token"], current_args["external_weight_path"], + "float32", ) torch_output = unet_runner.run_torch_unet( current_args["hf_model_name"], @@ -268,17 +177,17 @@ def testExportUnetModel(self): def testExportVaeModelDecode(self): current_args = copy.deepcopy(default_arguments) blob_name = vae.export_vae_model( - vae_model, - current_args["hf_model_name"], - current_args["batch_size"], - current_args["height"], - current_args["width"], - current_args["precision"], - "vmfb", - "safetensors", - "stable_diffusion_v1_4_vae.safetensors", - "cpu", - variant="decode", + hf_model_name=current_args["hf_model_name"], + batch_size=current_args["batch_size"], + height=current_args["height"], + width=current_args["width"], + precision=current_args["precision"], + compile_to="vmfb", + external_weights="safetensors", + external_weight_path="stable_diffusion_v1_4_vae.safetensors", + device="cpu", + target=current_args["iree_target_triple"], + decomp_attn=current_args["vae_decomp_attn"], upload_ir=UPLOAD_IR, ) current_args["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" @@ -290,14 +199,14 @@ def testExportVaeModelDecode(self): current_args["width"] // 8, dtype=torch.float32, ) - turbine = vae_runner.run_vae( + turbine = vae_runner.run_vae_decode( current_args["rt_device"], example_input, current_args["vmfb_path"], current_args["hf_model_name"], current_args["external_weight_path"], ) - torch_output = vae_runner.run_torch_vae( + torch_output = vae_runner.run_torch_vae_decode( current_args["hf_model_name"], "decode", example_input, @@ -311,107 +220,54 @@ def testExportVaeModelDecode(self): del torch_output del turbine 
os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("blob_name") + os.remove(blob_name) - def testExportVaeModelEncode(self): - current_args = copy.deepcopy(default_arguments) - blob_name = vae.export_vae_model( - vae_model, - current_args["hf_model_name"], - current_args["batch_size"], - current_args["height"], - current_args["width"], - current_args["precision"], - "vmfb", - "safetensors", - "stable_diffusion_v1_4_vae.safetensors", - "cpu", - variant="encode", - upload_ir=UPLOAD_IR, + def testSDPipeline(self): + from turbine_models.custom_models.sd_inference.sd_pipeline import ( + SharkSDPipeline, ) - current_args["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - current_args["vmfb_path"] = blob_name - example_input = torch.rand( - current_args["batch_size"], - 3, - current_args["height"], - current_args["width"], - dtype=torch.float32, - ) - turbine = vae_runner.run_vae( - current_args["rt_device"], - example_input, - current_args["vmfb_path"], - current_args["hf_model_name"], - current_args["external_weight_path"], - ) - torch_output = vae_runner.run_torch_vae( - current_args["hf_model_name"], - "encode", - example_input, - ) - err = utils.largest_error(torch_output, turbine) - assert err < 3e-3 - if UPLOAD_IR: - new_blob_name = blob_name.split(".") - new_blob_name = new_blob_name[0] + "-pass.mlir" - turbine_tank.changeBlobName(blob_name, new_blob_name) - os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove(blob_name) - @unittest.expectedFailure - def testExportPNDMScheduler(self): current_args = copy.deepcopy(default_arguments) - safe_name = "stable_diffusion_v1_4_scheduler" - blob_name = schedulers.export_scheduler_model( + decomp_attn = { + "text_encoder": False, + "unet": False, + "vae": current_args["vae_decomp_attn"], + } + sd_pipe = SharkSDPipeline( current_args["hf_model_name"], - current_args["scheduler_id"], - current_args["batch_size"], current_args["height"], current_args["width"], - current_args["num_inference_steps"], + current_args["batch_size"], + current_args["max_length"], current_args["precision"], - "vmfb", current_args["device"], current_args["iree_target_triple"], - upload_ir=UPLOAD_IR, - ) - current_args["external_weight_path"] = safe_name + ".safetensors" - current_args["vmfb_path"] = blob_name - sample = torch.rand( - current_args["batch_size"], - 4, - current_args["height"] // 8, - current_args["width"] // 8, - dtype=torch.float32, + ireec_flags=None, # ireec_flags + attn_spec=current_args["attn_spec"], + decomp_attn=decomp_attn, + pipeline_dir="test_vmfbs", # pipeline_dir + external_weights_dir="test_weights", # external_weights_dir + external_weights=current_args["external_weights"], + num_inference_steps=current_args["num_inference_steps"], + cpu_scheduling=True, + scheduler_id=current_args["scheduler_id"], + shift=None, # shift + use_i8_punet=current_args["use_i8_punet"], ) - encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) - turbine = schedulers_runner.run_scheduler( - current_args["rt_device"], - sample, - encoder_hidden_states, - current_args["vmfb_path"], - current_args["hf_model_name"], - current_args["hf_auth_token"], - current_args["external_weight_path"], - ) - torch_output = schedulers_runner.run_torch_scheduler( - current_args["hf_model_name"], - scheduler, + sd_pipe.prepare_all() + sd_pipe.load_map() + output = sd_pipe.generate_images( + current_args["prompt"], + current_args["negative_prompt"], current_args["num_inference_steps"], - sample, - encoder_hidden_states, + 1, # batch count + 
current_args["guidance_scale"], + current_args["seed"], + current_args["cpu_scheduling"], + current_args["scheduler_id"], + True, # return_img ) - err = utils.largest_error(torch_output, turbine) - assert err < 9e-3 - if UPLOAD_IR: - new_blob_name = blob_name.split(".") - new_blob_name = new_blob_name[0] + "-pass.mlir" - turbine_tank.changeBlobName(blob_name, new_blob_name) - os.remove("stable_diffusion_v1_4_scheduler.safetensors") - os.remove("stable_diffusion_v1_4_scheduler.vmfb") - del torch_output - del turbine + assert output is not None if __name__ == "__main__": diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py index fa44673ac..da9dfdafe 100644 --- a/models/turbine_models/tests/sdxl_test.py +++ b/models/turbine_models/tests/sdxl_test.py @@ -9,7 +9,7 @@ import torch from transformers import CLIPTokenizer from turbine_models.custom_models.sd_inference.utils import create_safe_name -from turbine_models.custom_models.sd_inference import schedulers +from turbine_models.custom_models.sd_inference import schedulers, vae from turbine_models.custom_models.sdxl_inference import ( sdxl_prompt_encoder, sdxl_prompt_encoder_runner, @@ -17,7 +17,6 @@ unet_runner, sdxl_scheduled_unet, sdxl_scheduled_unet_runner, - vae, vae_runner, sdxl_compiled_pipeline, ) @@ -81,46 +80,26 @@ def command_line_args(request): class StableDiffusionXLTest(unittest.TestCase): def setUp(self): self.safe_model_name = create_safe_name(arguments["hf_model_name"], "") - self.unet_model = unet.UnetModel( - # This is a public model, so no auth required - arguments["hf_model_name"], - precision=arguments["precision"], - ) - self.vae_model = vae.VaeModel( - # This is a public model, so no auth required - arguments["hf_model_name"], - custom_vae=( - "madebyollin/sdxl-vae-fp16-fix" - if arguments["precision"] == "fp16" - else None - ), - ) def test01_ExportPromptEncoder(self): - if arguments["device"] in ["vulkan", "cuda"]: + if arguments["device"] in ["vulkan", "cuda", "rocm"]: self.skipTest( - "Compilation error on vulkan; Runtime error on rocm; To be tested on cuda." + "Compilation error on vulkan; recent numerics regression (nans) on hip driver, To be tested on cuda." ) arguments["external_weight_path"] = ( "prompt_encoder." 
+ arguments["external_weights"] ) - _, prompt_encoder_vmfb = sdxl_prompt_encoder.export_prompt_encoder( + prompt_encoder_vmfb = sdxl_prompt_encoder.export_prompt_encoder( arguments["hf_model_name"], - None, - arguments["max_length"], - arguments["precision"], - "vmfb", - "safetensors", - arguments["external_weight_path"], - arguments["device"], - arguments["iree_target_triple"], - arguments["ireec_flags"], - False, - None, - None, - arguments["attn_spec"], - False, - arguments["batch_size"], + hf_auth_token=None, + max_length=arguments["max_length"], + batch_size=arguments["batch_size"], + precision=arguments["precision"], + compile_to="vmfb", + external_weights="safetensors", + external_weight_path=arguments["external_weight_path"], + device=arguments["device"], + target=arguments["iree_target_triple"], ) tokenizer_1 = CLIPTokenizer.from_pretrained( arguments["hf_model_name"], @@ -177,9 +156,7 @@ def test01_ExportPromptEncoder(self): def test02_ExportUnetModel(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Unknown error on vulkan; To be tested on cuda.") - unet.export_unet_model( - unet_model=self.unet_model, - # This is a public model, so no auth required + unet_vmfb = unet.export_unet_model( hf_model_name=arguments["hf_model_name"], batch_size=arguments["batch_size"], height=arguments["height"], @@ -195,10 +172,11 @@ def test02_ExportUnetModel(self): + "_unet." + arguments["external_weights"], device=arguments["device"], - target_triple=arguments["iree_target_triple"], + target=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], decomp_attn=arguments["decomp_attn"], attn_spec=arguments["attn_spec"], + exit_on_vmfb=False, ) arguments["external_weight_path"] = ( self.safe_model_name @@ -207,20 +185,7 @@ def test02_ExportUnetModel(self): + "_unet." + arguments["external_weights"] ) - arguments["vmfb_path"] = ( - self.safe_model_name - + "_" - + str(arguments["max_length"]) - + "_" - + str(arguments["height"]) - + "x" - + str(arguments["width"]) - + "_" - + arguments["precision"] - + "_unet_" - + arguments["device"] - + ".vmfb" - ) + arguments["vmfb_path"] = unet_vmfb dtype = torch.float16 if arguments["precision"] == "fp16" else torch.float32 sample = torch.rand( ( @@ -231,7 +196,7 @@ def test02_ExportUnetModel(self): ), dtype=dtype, ) - timestep = torch.zeros(1, dtype=torch.int64) + timestep = torch.zeros(1, dtype=dtype) prompt_embeds = torch.rand( (2 * arguments["batch_size"], arguments["max_length"], 2048), dtype=dtype, @@ -286,9 +251,7 @@ def test02_ExportUnetModel(self): def test03_ExportVaeModelDecode(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Compilation error on vulkan; To be tested on cuda.") - vae.export_vae_model( - vae_model=self.vae_model, - # This is a public model, so no auth required + vae_vmfb = vae.export_vae_model( hf_model_name=arguments["hf_model_name"], batch_size=arguments["batch_size"], height=arguments["height"], @@ -302,12 +265,11 @@ def test03_ExportVaeModelDecode(self): + "_vae_decode." + arguments["external_weights"], device=arguments["device"], - target_triple=arguments["iree_target_triple"], + target=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], - variant="decode", - decomp_attn=arguments["decomp_attn"], + decomp_attn=True, attn_spec=arguments["attn_spec"], - exit_on_vmfb=True, + exit_on_vmfb=False, ) arguments["external_weight_path"] = ( self.safe_model_name @@ -316,18 +278,7 @@ def test03_ExportVaeModelDecode(self): + "_vae_decode." 
+ arguments["external_weights"] ) - arguments["vmfb_path"] = ( - self.safe_model_name - + "_" - + str(arguments["height"]) - + "x" - + str(arguments["width"]) - + "_" - + arguments["precision"] - + "_vae_decode_" - + arguments["device"] - + ".vmfb" - ) + arguments["vmfb_path"] = vae_vmfb example_input = torch.ones( arguments["batch_size"], 4, @@ -376,7 +327,7 @@ def test04_ExportVaeModelEncode(self): self.skipTest( "Compilation error on cpu, vulkan and rocm; To be tested on cuda." ) - vae.export_vae_model( + vae_vmfb = vae.export_vae_model( vae_model=self.vae_model, # This is a public model, so no auth required hf_model_name=arguments["hf_model_name"], @@ -392,10 +343,9 @@ def test04_ExportVaeModelEncode(self): + "_vae_encode." + arguments["external_weights"], device=arguments["device"], - target_triple=arguments["iree_target_triple"], + target=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], - variant="encode", - decomp_attn=arguments["decomp_attn"], + decomp_attn=True, exit_on_vmfb=True, ) arguments["external_weight_path"] = ( @@ -405,18 +355,7 @@ def test04_ExportVaeModelEncode(self): + "_vae_encode." + arguments["external_weights"] ) - arguments["vmfb_path"] = ( - self.safe_model_name - + "_" - + str(arguments["height"]) - + "x" - + str(arguments["width"]) - + "_" - + arguments["precision"] - + "_vae_encode_" - + arguments["device"] - + ".vmfb" - ) + arguments["vmfb_path"] = vae_vmfb example_input = torch.ones( arguments["batch_size"], 3, @@ -460,100 +399,103 @@ def test04_ExportVaeModelEncode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) def test05_t2i_generate_images(self): + if arguments["device"] in ["vulkan", "cuda"]: + self.skipTest("Have issues with submodels on vulkan, cuda") + from turbine_models.custom_models.sd_inference.sd_pipeline import ( + SharkSDPipeline, + ) + + decomp_attn = { + "text_encoder": False, + "unet": False, + "vae": True, + } + sd_pipe = SharkSDPipeline( + arguments["hf_model_name"], + arguments["height"], + arguments["width"], + arguments["batch_size"], + arguments["max_length"], + arguments["precision"], + arguments["device"], + arguments["iree_target_triple"], + ireec_flags=None, # ireec_flags + attn_spec=arguments["attn_spec"], + decomp_attn=decomp_attn, + pipeline_dir="test_vmfbs", # pipeline_dir + external_weights_dir="test_weights", # external_weights_dir + external_weights=arguments["external_weights"], + num_inference_steps=arguments["num_inference_steps"], + cpu_scheduling=True, + scheduler_id=arguments["scheduler_id"], + shift=None, # shift + use_i8_punet=False, + ) + sd_pipe.prepare_all() + sd_pipe.load_map() + output = sd_pipe.generate_images( + arguments["prompt"], + arguments["negative_prompt"], + arguments["num_inference_steps"], + 1, # batch count + arguments["guidance_scale"], + arguments["seed"], + True, + arguments["scheduler_id"], + True, # return_img + ) + assert output is not None + + @pytest.mark.skip(reason="Needs sdxl_quantized branch of IREE") + def test06_t2i_generate_images_punet(self): if arguments["device"] in ["vulkan", "cuda", "rocm"]: self.skipTest( "Have issues with submodels on vulkan, cuda; ROCM hangs on mi250 despite submodels working." 
) - mlirs = { - "vae_decode": None, - "prompt_encoder": None, - "scheduled_unet": None, - "pipeline": None, - "full_pipeline": None, - } - vmfbs = { - "vae_decode": None, - "prompt_encoder": None, - "scheduled_unet": None, - "pipeline": None, - "full_pipeline": None, - } - weights = { - "vae_decode": None, - "prompt_encoder": None, - "scheduled_unet": None, - "pipeline": None, - "full_pipeline": None, - } + from turbine_models.custom_models.sd_inference.sd_pipeline import ( + SharkSDPipeline, + ) - if not arguments["pipeline_dir"]: - pipe_id_list = [ - "sdxl_1_0", - str(arguments["height"]), - str(arguments["width"]), - str(arguments["max_length"]), - arguments["precision"], - arguments["device"], - ] - arguments["pipeline_dir"] = os.path.join( - ".", - "_".join(pipe_id_list), - ) - ireec_flags = { - "unet": arguments["ireec_flags"], - "vae": arguments["ireec_flags"], - "clip": arguments["ireec_flags"], - "pipeline": arguments["ireec_flags"], + decomp_attn = { + "text_encoder": False, + "unet": False, + "vae": True, } - user_mlir_list = [] - for submodel_id, mlir_path in zip(mlirs.keys(), user_mlir_list): - if submodel_id in mlir_path: - mlirs[submodel_id] = mlir_path - external_weights_dir = arguments["pipeline_dir"] - sdxl_pipe = sdxl_compiled_pipeline.SharkSDXLPipeline( + sd_pipe = SharkSDPipeline( arguments["hf_model_name"], - arguments["scheduler_id"], arguments["height"], arguments["width"], - arguments["precision"], - arguments["max_length"], arguments["batch_size"], - arguments["num_inference_steps"], + arguments["max_length"], + arguments["precision"], arguments["device"], arguments["iree_target_triple"], - ireec_flags, - arguments["attn_spec"], - arguments["decomp_attn"], - arguments["pipeline_dir"], - external_weights_dir, - arguments["external_weights"], - ) - vmfbs, weights = sdxl_pipe.check_prepared( - mlirs, vmfbs, weights, interactive=False - ) - sdxl_pipe.load_pipeline( - vmfbs, weights, arguments["rt_device"], arguments["compiled_pipeline"] - ) - sdxl_pipe.generate_images( + ireec_flags=None, # ireec_flags + attn_spec=arguments["attn_spec"], + decomp_attn=decomp_attn, + pipeline_dir="test_vmfbs", # pipeline_dir + external_weights_dir="test_weights", # external_weights_dir + external_weights=arguments["external_weights"], + num_inference_steps=arguments["num_inference_steps"], + cpu_scheduling=True, + scheduler_id=arguments["scheduler_id"], + shift=None, # shift + use_i8_punet=True, + ) + sd_pipe.prepare_all() + sd_pipe.load_map() + output = sd_pipe.generate_images( arguments["prompt"], arguments["negative_prompt"], - 1, + arguments["num_inference_steps"], + 1, # batch count arguments["guidance_scale"], arguments["seed"], + True, + arguments["scheduler_id"], + True, # return_img ) - print("Image generation complete.") - os.remove(os.path.join(arguments["pipeline_dir"], "prompt_encoder.vmfb")) - os.remove( - os.path.join( - arguments["pipeline_dir"], - arguments["scheduler_id"] - + "_unet_" - + str(arguments["num_inference_steps"]) - + ".vmfb", - ) - ) - os.remove(os.path.join(arguments["pipeline_dir"], "vae_decode.vmfb")) - os.remove(os.path.join(arguments["pipeline_dir"], "full_pipeline.vmfb")) + assert output is not None if __name__ == "__main__": From 323ecf4e14f6699f109a96d6377d24cdfbf00daf Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 11:06:39 -0500 Subject: [PATCH 158/174] [SD3] Fix text encoder impls import in text encoder runner. 
--- .../custom_models/sd3_inference/sd3_text_encoders_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py index 3a590b62c..ec54227ab 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders_runner.py @@ -1,5 +1,9 @@ from turbine_models.model_runner import vmfbRunner -from text_encoder_impls import SD3Tokenizer, T5XXLTokenizer, SDXLClipGTokenizer +from turbine_models.custom_models.sd3_inference.text_encoder_impls import ( + SD3Tokenizer, + T5XXLTokenizer, + SDXLClipGTokenizer, +) from iree import runtime as ireert import torch import numpy as np From 4bef98bd6a24c3f3c71c82817fcf6bcf405171e6 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:45:07 -0700 Subject: [PATCH 159/174] Minor fixes for SDXL/unet (#770) The reason for updating the revision hash is this PR by Stella in sharktank: https://github.com/nod-ai/sharktank/pull/93. Because we are using sharktank TOM, we need to update here too so that it gives sharktank the expected quant_params.json. --------- Signed-off-by: saienduri --- models/turbine_models/custom_models/sdxl_inference/unet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 4d3af598c..6eeb5623b 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -92,7 +92,7 @@ def get_punet_model(hf_model_name, external_weight_path, precision="i8"): if precision == "i8": repo_id = "amd-shark/sdxl-quant-models" subfolder = "unet/int8" - revision = "82e06d6ea22ac78102a9aded69e8ddfb9fa4ae37" + revision = "942e771bf0c2657a8b33380103d04747a75dfa4a" elif precision in ["fp16", "fp32"]: repo_id = hf_model_name subfolder = "unet" From 027b615b8b032c45c513c8e86612a1e83df2ab4a Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Wed, 10 Jul 2024 15:45:29 -0500 Subject: [PATCH 160/174] [WIP] add SDXL ml-perf harness model directory to choose correct model_map Signed-off-by: aviator19941 --- .../custom_models/sd_inference/sd_pipeline.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 868872479..7199a72f6 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -186,7 +186,11 @@ def get_sd_model_map(hf_model_name): name = hf_model_name["text_encoder"] else: name = hf_model_name - if name in ["stabilityai/sdxl-turbo", "stabilityai/stable-diffusion-xl-base-1.0"]: + if name in [ + "stabilityai/sdxl-turbo", + "stabilityai/stable-diffusion-xl-base-1.0", + "/models/SDXL/official_pytorch/fp16/stable_diffusion_fp16//checkpoint_pipe", + ]: return sdxl_model_map elif "stabilityai/stable-diffusion-3" in name: return sd3_model_map From 08a148862a8f64beb34370e465282027f00c390a Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Wed, 10 Jul 2024 17:02:30 -0500 Subject: [PATCH 161/174] [WIP] Fix naming issue for punet external weights file Signed-off-by: aviator19941 --- 
models/turbine_models/custom_models/pipeline_base.py | 6 +----- models/turbine_models/custom_models/sdxl_inference/unet.py | 5 +---- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py index d46e20b84..4a02e73b2 100644 --- a/models/turbine_models/custom_models/pipeline_base.py +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -547,11 +547,7 @@ def export_submodel( self.map[submodel]["export_args"]["external_weight_path"] = os.path.join( self.external_weights_dir, - utils.create_safe_name( - self.map[submodel]["export_args"].get("hf_model_name", ""), "" - ) - + f"_{submodel}_{self.map[submodel]['precision']}." - + self.map[submodel]["external_weights"], + self.map[submodel]["export_args"]["external_weight_path"], ) elif not self.map[submodel].get("external_weights"): diff --git a/models/turbine_models/custom_models/sdxl_inference/unet.py b/models/turbine_models/custom_models/sdxl_inference/unet.py index 6eeb5623b..bd36db763 100644 --- a/models/turbine_models/custom_models/sdxl_inference/unet.py +++ b/models/turbine_models/custom_models/sdxl_inference/unet.py @@ -111,10 +111,7 @@ def download(filename): if precision == "i8": results["quant_params.json"] = download("quant_params.json") - ds_filename = ( - os.path.basename(external_weight_path).split("unet")[0] - + "punet_dataset_i8.irpa" - ) + ds_filename = os.path.basename(external_weight_path) output_path = os.path.join(output_dir, ds_filename) ds = get_punet_dataset( results["config.json"], From 8a1bda1a80df7689577b0cf51832cf0a10c6a8db Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Wed, 10 Jul 2024 18:49:12 -0500 Subject: [PATCH 162/174] Fix is_sdxl not checking the base_model_name properly Signed-off-by: aviator19941 --- models/turbine_models/custom_models/sd_inference/sd_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 7199a72f6..a450afc4d 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -311,7 +311,7 @@ def __init__( else hf_model_name.get("unet", hf_model_name.get("mmdit")) ) self.is_img2img = False - self.is_sdxl = "xl" in self.base_model_name + self.is_sdxl = "xl" in self.base_model_name.lower() self.is_sd3 = "stable-diffusion-3" in self.base_model_name if self.is_sdxl: if self.split_scheduler: From beb2c13daf3fdf552391a21b6d07c0bc71528de1 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Thu, 11 Jul 2024 10:56:02 -0500 Subject: [PATCH 163/174] Fix sd_inference/vae case for SDXL Signed-off-by: aviator19941 --- models/turbine_models/custom_models/sd_inference/vae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index d9c0fd743..bbcf475e8 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -142,7 +142,7 @@ def export_vae_model( if "stable-diffusion-3" in hf_model_name: vae_model = SD3VaeModel(hf_model_name) else: - if "xl" in hf_model_name and precision == "fp16": + if "xl" in hf_model_name.lower() and precision == "fp16": custom_vae = "madebyollin/sdxl-vae-fp16-fix" else: custom_vae = None From 
672f3e67508cac13c98d36b9c418a00182142bf1 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 14:02:31 -0500 Subject: [PATCH 164/174] Update SD3 tests and exports. --- .../custom_models/sd3_inference/sd3_mmdit.py | 4 +- .../sd3_inference/sd3_text_encoders.py | 16 ++-- .../custom_models/sd3_inference/sd3_vae.py | 6 +- .../custom_models/sd_inference/utils.py | 2 + models/turbine_models/tests/conftest.py | 14 +++ models/turbine_models/tests/sd3_test.py | 95 ++++++++----------- 6 files changed, 68 insertions(+), 69 deletions(-) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py index d87ff5993..b71d3129e 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py @@ -177,7 +177,7 @@ def export_mmdit_model( device, target_triple, ireec_flags, - safe_name + "_" + target_triple, + safe_name, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, @@ -265,7 +265,7 @@ class CompiledMmdit(CompiledModule): device, target_triple, ireec_flags, - safe_name + "_" + target_triple, + safe_name, return_path=True, attn_spec=attn_spec, ) diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index 33107aa9f..08d9c1621 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -123,7 +123,7 @@ def export_text_encoders( device=None, target_triple=None, ireec_flags=None, - exit_on_vmfb=True, + exit_on_vmfb=False, pipeline_dir=None, input_mlir=None, attn_spec=None, @@ -132,7 +132,7 @@ def export_text_encoders( safe_name = utils.create_safe_name( hf_model_name, - f"_bs{output_batchsize}_{str(max_length)}_{precision}_text_encoders-{device}", + f"_bs{batch_size}_{str(max_length)}_{precision}_text_encoders", ) if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) @@ -143,7 +143,7 @@ def export_text_encoders( device, target_triple, ireec_flags, - safe_name + "_" + target_triple, + safe_name, mlir_source="file", return_path=not exit_on_vmfb, const_expr_hoisting=True, @@ -151,7 +151,7 @@ def export_text_encoders( ) return vmfb_path model = TextEncoderModule( - batch_size=output_batchsize, + batch_size=batch_size, ) mapper = {} @@ -199,8 +199,8 @@ class CompiledTextEncoder(CompiledModule): "input_shapes": [(1, max_length, 2) for x in range(6)], "input_dtypes": ["int64" for x in range(6)], "output_shapes": [ - (2 * output_batchsize, max_length * 2, 4096), - (2 * output_batchsize, 2048), + (2 * batch_size, max_length * 2, 4096), + (2 * batch_size, 2048), ], "output_dtypes": ["float32"], } @@ -214,12 +214,12 @@ class CompiledTextEncoder(CompiledModule): device, target_triple, ireec_flags, - safe_name + "_" + target_triple, + safe_name, return_path=not exit_on_vmfb, const_expr_hoisting=True, attn_spec=attn_spec, ) - return module_str, vmfb_path + return vmfb_path if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py index e6578bb08..ff24864a6 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py @@ -73,7 +73,7 @@ def export_vae_model( dtype = torch.float16 if precision == "fp16" else torch.float32 safe_name = 
utils.create_safe_name( hf_model_name, - f"_bs{batch_size}_{height}x{width}_{precision}_vae_{device}", + f"_bs{batch_size}_{height}x{width}_{precision}_vae", ) if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) @@ -84,7 +84,7 @@ def export_vae_model( device, target_triple, ireec_flags, - safe_name + "_" + target_triple, + safe_name, mlir_source="file", return_path=not exit_on_vmfb, attn_spec=attn_spec, @@ -156,7 +156,7 @@ class CompiledVae(CompiledModule): device, target_triple, ireec_flags, - safe_name + "_" + target_triple, + safe_name, return_path=not exit_on_vmfb, attn_spec=attn_spec, ) diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 447076d42..57ade2fb6 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -106,6 +106,8 @@ _IREE_BACKEND_MAP = { "cpu": "llvm-cpu", + "local-task": "llvm-cpu", + "local-sync": "llvm-cpu", "rocm": "rocm", "rocm-legacy": "rocm", "hip": "rocm", diff --git a/models/turbine_models/tests/conftest.py b/models/turbine_models/tests/conftest.py index d93aa2e60..4292c7390 100644 --- a/models/turbine_models/tests/conftest.py +++ b/models/turbine_models/tests/conftest.py @@ -37,6 +37,7 @@ def pytest_addoption(parser): parser.addoption("--compile_to", action="store", default=None) parser.addoption("--external_weights", action="store", default="safetensors") parser.addoption("--decomp_attn", action="store", default=False) + parser.addoption("--vae_decomp_attn", action="store", default=False) parser.addoption("--attn_spec", action="store", default="") # Compiler Options parser.addoption("--device", action="store", default="cpu") @@ -51,3 +52,16 @@ def pytest_addoption(parser): parser.addoption("--benchmark", action="store_true", default=False) parser.addoption("--tracy_profile", action="store_true", default=False) parser.addoption("--compiled_pipeline", type=bool, default=False) + parser.addoption("--model_path", type=str, action="store", default=None) + parser.addoption("--vae_model_path", type=str, action="store", default=None) + parser.addoption("--pipeline_vmfb_path", type=str, action="store", default=None) + parser.addoption("--scheduler_vmfb_path", type=str, action="store", default=None) + parser.addoption("--split_scheduler", action="store_true", default=True) + parser.addoption("--cpu_scheduling", action="store_true", default=True) + parser.addoption("--npu_delegate_path", type=str, action="store", default=None) + parser.addoption("--clip_precision", type=str, action="store", default=None) + parser.addoption("--mmdit_precision", type=str, action="store", default=None) + parser.addoption("--unet_precision", type=str, action="store", default=None) + parser.addoption("--vae_precision", type=str, action="store", default=None) + parser.addoption("--shift", type=float, action="store", default=None) + parser.addoption("--denoise", action="store_true", default=None) diff --git a/models/turbine_models/tests/sd3_test.py b/models/turbine_models/tests/sd3_test.py index a627eb287..e44defe65 100644 --- a/models/turbine_models/tests/sd3_test.py +++ b/models/turbine_models/tests/sd3_test.py @@ -41,10 +41,8 @@ @pytest.fixture(scope="session") def command_line_args(request): arguments["hf_auth_token"] = request.config.getoption("--hf_auth_token") - arguments["hf_model_name"] = request.config.getoption("--hf_model_name") + arguments["hf_model_name"] = 
"stabilityai/stable-diffusion-3-medium-diffusers" arguments["scheduler_id"] = request.config.getoption("--scheduler_id") - arguments["model_path"] = request.config.getoption("--model_path") - arguments["vae_model_path"] = request.config.getoption("--vae_model_path") arguments["prompt"] = request.config.getoption("--prompt") arguments["negative_prompt"] = request.config.getoption("--negative_prompt") arguments["num_inference_steps"] = int( @@ -68,81 +66,59 @@ def command_line_args(request): arguments["pipeline_dir"] = request.config.getoption("--pipeline_dir") arguments["compiled_pipeline"] = request.config.getoption("--compiled_pipeline") arguments["npu_delegate_path"] = request.config.getoption("--npu_delegate_path") - arguments["clip_device"] = request.config.getoption("--clip_device") - arguments["mmdit_device"] = request.config.getoption("--mmdit_device") - arguments["vae_device"] = request.config.getoption("--vae_device") - arguments["clip_target"] = request.config.getoption("--clip_target") - arguments["vae_target"] = request.config.getoption("--vae_target") - arguments["mmdit_target"] = request.config.getoption("--mmdit_target") arguments["batch_size"] = int(request.config.getoption("--batch_size")) arguments["height"] = int(request.config.getoption("--height")) arguments["width"] = int(request.config.getoption("--width")) arguments["precision"] = request.config.getoption("--precision") arguments["vae_precision"] = request.config.getoption("--vae_precision") arguments["max_length"] = int(request.config.getoption("--max_length")) - arguments["vae_variant"] = request.config.getoption("--vae_variant") arguments["shift"] = request.config.getoption("--shift") arguments["vae_decomp_attn"] = request.config.getoption("--vae_decomp_attn") - arguments["vae_dtype"] = request.config.getoption("--vae_dtype") arguments["external_weights"] = request.config.getoption("--external_weights") arguments["decomp_attn"] = request.config.getoption("--decomp_attn") - arguments["exit_on_vmfb"] = request.config.getoption("--exit_on_vmfb") - arguments["output"] = request.config.getoption("--output") arguments["attn_spec"] = request.config.getoption("--attn_spec") - arguments["device"] = request.config.getoption("--device") - arguments["rt_device"] = request.config.getoption("--rt_device") + arguments["device"] = utils.iree_device_map(request.config.getoption("--device")) + arguments["backend"] = utils.iree_backend_map( + request.config.getoption("--device").split("://")[0] + ) arguments["iree_target_triple"] = request.config.getoption("--iree_target_triple") arguments["ireec_flags"] = request.config.getoption("--ireec_flags") - arguments["attn_flags"] = request.config.getoption("--attn_flags") - arguments["clip_flags"] = request.config.getoption("--clip_flags") - arguments["vae_flags"] = request.config.getoption("--vae_flags") - arguments["mmdit_flags"] = request.config.getoption("--mmdit_flags") + # TODO (Ean Garvey): align attention spec handling so we don't have to do this. 
+ if not arguments["attn_spec"] and not arguments["decomp_attn"]: + if "gfx9" in arguments["iree_target_triple"]: + arguments["attn_spec"] = "mfma" + elif "gfx11" in arguments["iree_target_triple"]: + arguments["attn_spec"] = "wmma" @pytest.mark.usefixtures("command_line_args") class StableDiffusion3Test(unittest.TestCase): def setUp(self): self.safe_model_name = create_safe_name(arguments["hf_model_name"], "") - self.mmdit_model = sd3_mmdit.MMDiTModel( - arguments["hf_model_name"], - precision=arguments["precision"], - ) - self.vae_model = sd3_vae.VaeModel( - # This is a public model, so no auth required - arguments["hf_model_name"], - custom_vae=( - "madebyollin/sdxl-vae-fp16-fix" - if arguments["precision"] == "fp16" - else None - ), - ) + @pytest.mark.xfail(reason="Numerics issues on ~.01 percent of output values") def test01_ExportPromptEncoder(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Not testing sd3 on vk or cuda") arguments["external_weight_path"] = ( - arguments["external_weight_path"] - + "/sd3_text_encoders_" - + arguments["precision"] - + ".irpa" + self.safe_model_name + "_text_encoders_" + arguments["precision"] + ".irpa" ) - _, prompt_encoder_vmfb = sd3_text_encoders.export_text_encoders( + prompt_encoder_vmfb = sd3_text_encoders.export_text_encoders( arguments["hf_model_name"], - hf_auth_token=None, max_length=arguments["max_length"], precision=arguments["precision"], compile_to="vmfb", external_weights=arguments["external_weights"], external_weight_path=arguments["external_weight_path"], - device=arguments["device"], - target_triple=arguments["clip_target"], + device=arguments["backend"], + target_triple=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], - exit_on_vmfb=True, + exit_on_vmfb=False, pipeline_dir=arguments["pipeline_dir"], input_mlir=None, attn_spec=arguments["attn_spec"], - output_batchsize=arguments["batch_size"], - decomp_attn=arguments["decomp_attn"], + batch_size=arguments["batch_size"], + decomp_attn=True, ) tokenizer = SD3Tokenizer() ( @@ -158,7 +134,7 @@ def test01_ExportPromptEncoder(self): turbine_output2, ) = sd3_text_encoders_runner.run_prompt_encoder( prompt_encoder_vmfb, - arguments["rt_device"], + arguments["device"], arguments["external_weight_path"], text_input_ids_list, uncond_input_ids_list, @@ -174,9 +150,16 @@ def test01_ExportPromptEncoder(self): np.testing.assert_allclose(torch_output1, turbine_output1, rtol, atol) np.testing.assert_allclose(torch_output2, turbine_output2, rtol, atol) + @pytest.mark.xfail( + reason="Runners need secure dedicated access to gated HF repo for imports." 
+ ) def test02_ExportMMDITModel(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("Not testing on vulkan or cuda") + self.mmdit_model = sd3_mmdit.MMDiTModel( + arguments["hf_model_name"], + dtype=torch.float16 if arguments["precision"] == "fp16" else torch.float32, + ) arguments["external_weight_path"] = ( self.safe_model_name + "_" @@ -186,18 +169,16 @@ def test02_ExportMMDITModel(self): ) sd3_mmdit.export_mmdit_model( mmdit_model=self.mmdit_model, - # This is a public model, so no auth required hf_model_name=arguments["hf_model_name"], batch_size=arguments["batch_size"], height=arguments["height"], width=arguments["width"], precision=arguments["precision"], max_length=arguments["max_length"], - hf_auth_token=None, compile_to="vmfb", external_weights=arguments["external_weights"], external_weight_path=arguments["external_weight_path"], - device=arguments["mmdit_device"], + device=arguments["backend"], target_triple=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], decomp_attn=arguments["decomp_attn"], @@ -266,15 +247,16 @@ def test02_ExportMMDITModel(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) + @pytest.mark.xfail( + reason="Runners need secure dedicated access to gated HF repo for imports." + ) def test03_ExportVaeModelDecode(self): if arguments["device"] in ["vulkan", "cuda"]: self.skipTest("not testing vulkan or cuda") - sd3_vae.export_vae_model( - vae_model=self.vae_model, + vae_model = sd3_vae.VaeModel( # This is a public model, so no auth required - exit_on_vmfb=True, + arguments["hf_model_name"], ) - arguments["external_weight_path"] = ( self.safe_model_name + "_" @@ -283,7 +265,7 @@ def test03_ExportVaeModelDecode(self): + arguments["external_weights"] ) sd3_vae.export_vae_model( - self.vae_model, + vae_model, hf_model_name=arguments["hf_model_name"], batch_size=arguments["batch_size"], height=arguments["height"], @@ -292,10 +274,9 @@ def test03_ExportVaeModelDecode(self): compile_to="vmfb", external_weights=arguments["external_weights"], external_weight_path=arguments["external_weight_path"], - device=arguments["device"], + device=arguments["backend"], target_triple=arguments["iree_target_triple"], ireec_flags=arguments["ireec_flags"], - variant="decode", decomp_attn=arguments["decomp_attn"], attn_spec=arguments["attn_spec"], ) @@ -322,7 +303,7 @@ def test03_ExportVaeModelDecode(self): if arguments["precision"] == "fp16": example_input = example_input.half() turbine = sd3_vae_runner.run_vae( - arguments["rt_device"], + arguments["device"], example_input, arguments["vmfb_path"], arguments["hf_model_name"], @@ -354,7 +335,9 @@ def test03_ExportVaeModelDecode(self): np.testing.assert_allclose(torch_output, turbine, rtol, atol) - @pytest.mark.skip("Waiting on inference plumbing for generalized sd pipeline") + @pytest.mark.skip( + reason="Waiting on inference plumbing for generalized sd pipeline" + ) def test04SDPipeline(self): from turbine_models.custom_models.sd_inference.sd_pipeline import ( SharkSDPipeline, From 021948f738f50875386c2d07d1033f37a8eb41d6 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 16:48:36 -0500 Subject: [PATCH 165/174] Only setup export weights filepath if weights need to be created. 
--- models/turbine_models/custom_models/pipeline_base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py index 4a02e73b2..2341e8360 100644 --- a/models/turbine_models/custom_models/pipeline_base.py +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -541,7 +541,11 @@ def export_submodel( if not os.path.exists(self.pipeline_dir): os.makedirs(self.pipeline_dir) - if self.map[submodel]["external_weights"] and self.external_weights_dir: + if ( + self.map[submodel]["external_weights"] + and self.external_weights_dir + and not self.map[submodel].get("weights") + ): if not os.path.exists(self.external_weights_dir): os.makedirs(self.external_weights_dir, exist_ok=False) From 6396df1333605a62b30eeacc719e9a5e790dd76c Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 17:43:26 -0500 Subject: [PATCH 166/174] Add a model map attribute for models that need weights for exports. --- models/turbine_models/custom_models/pipeline_base.py | 8 ++++++-- .../custom_models/sd_inference/sd_pipeline.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py index 2341e8360..752725358 100644 --- a/models/turbine_models/custom_models/pipeline_base.py +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -501,12 +501,10 @@ def is_prepared(self, vmfbs, weights): ) if len(candidates) == 1: self.map[key]["weights"] = candidates[0] - self.map[key]["export_args"]["external_weight_path"] = None elif len(candidates) > 1: print(f"Multiple weight files found for {key}: {candidates}") print(f"Choosing {candidates[0]} for {key}.") self.map[key][weights] = candidates[0] - self.map[key]["export_args"]["external_weight_path"] = None elif self.map[key].get("external_weights"): # weights not found in external_weights_dir. Add to list of files to generate. missing[key].append("weights") @@ -553,6 +551,12 @@ def export_submodel( self.external_weights_dir, self.map[submodel]["export_args"]["external_weight_path"], ) + elif self.map[submodel].get("weights") and self.map[submodel].get( + "use_weights_to_export" + ): + self.map[submodel]["export_args"]["external_weight_path"] = self.map[ + submodel + ]["weights"] elif not self.map[submodel].get("external_weights"): print( diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index a450afc4d..cd99c51df 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -334,6 +334,7 @@ def __init__( if self.use_i8_punet: self.map["unet"]["export_args"]["precision"] = "i8" self.map["unet"]["export_args"]["use_punet"] = True + self.map["unet"]["use_weights_for_export"] = True self.map["unet"]["keywords"].append("punet") self.map["unet"]["module_name"] = "compiled_punet" self.map["unet"]["function_name"] = "main" From 4814c92d4963b91d10cb13885ae345df38e3b5eb Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 17:49:29 -0500 Subject: [PATCH 167/174] Only pop pipeline wrappers from model map if they exist. 
--- .../custom_models/sd_inference/sd_pipeline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index cd99c51df..65c23fecd 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -315,8 +315,10 @@ def __init__( self.is_sd3 = "stable-diffusion-3" in self.base_model_name if self.is_sdxl: if self.split_scheduler: - self.map.pop("unetloop") - self.map.pop("fullpipeline") + if self.map.get("unetloop"): + self.map.pop("unetloop") + if self.map.get("fullpipeline"): + self.map.pop("fullpipeline") self.tokenizers = [ CLIPTokenizer.from_pretrained( self.base_model_name, subfolder="tokenizer" From 4468ed8de42a2b4ede2624fc8843e5e30e86a850 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 22:09:00 -0500 Subject: [PATCH 168/174] Fix gpu scheduling for sdxl, fixup for batched clip metadata --- .../custom_models/pipeline_base.py | 56 +++++---- .../sd3_inference/sd3_text_encoders.py | 4 +- .../custom_models/sd_inference/schedulers.py | 62 +++++----- .../custom_models/sd_inference/sd_cmd_opts.py | 2 +- .../custom_models/sd_inference/sd_pipeline.py | 109 ++++++++++++------ .../custom_models/sd_inference/utils.py | 5 + .../sdxl_inference/sdxl_prompt_encoder.py | 2 +- 7 files changed, 146 insertions(+), 94 deletions(-) diff --git a/models/turbine_models/custom_models/pipeline_base.py b/models/turbine_models/custom_models/pipeline_base.py index 752725358..3260d6f9c 100644 --- a/models/turbine_models/custom_models/pipeline_base.py +++ b/models/turbine_models/custom_models/pipeline_base.py @@ -132,7 +132,8 @@ def get_metadata(self): self.metadata[function_name] = None def _validate_or_convert_inputs(self, function_name, inputs): - if self.metadata: + val_inputs = [None for i in inputs] + if self.metadata.get(function_name): expected_input_shapes = self.metadata.get(function_name, {}).get( "input_shapes" ) @@ -143,51 +144,51 @@ def _validate_or_convert_inputs(self, function_name, inputs): ) if expected_input_dtypes: expected_input_dtypes = ast.literal_eval(expected_input_dtypes) - if not isinstance(expected_input_shapes, list): - expected_input_shapes = [expected_input_shapes] if not expected_input_dtypes: pass if not expected_input_shapes: logging.warning( f"No input shapes found for {self.module_name}['{function_name}']." ) - for i in inputs: + for idx, i in enumerate(inputs): if not isinstance(i, ireert.DeviceArray): - i = ireert.asdevicearray(self.device, i) + val_inputs[idx] = ireert.asdevicearray(self.device, i) pass + if not isinstance(expected_input_shapes, list): + expected_input_shapes = [expected_input_shapes] for i, input_dtype in enumerate(expected_input_dtypes): if not isinstance(inputs[i], ireert.DeviceArray): - if isinstance(inputs[i], torch.Tensor) or isinstance( - inputs[i], torch.HalfTensor - ): - new_input = inputs[i].float().cpu().numpy() - else: - new_input = inputs[i] - - inputs[i] = ireert.asdevicearray( - self.device, new_input, input_dtype + val_inputs[i] = ireert.asdevicearray( + self.device, inputs[i], input_dtype ) - if str(inputs[i].dtype).split(".")[-1] != input_dtype: + elif str(inputs[i].dtype).split(".")[-1] != input_dtype: logging.warning( f"Converting input {i} to {input_dtype} for {self.module_name}['{function_name}']." 
) - inputs[i] = inputs[i].astype(input_dtype) + val_inputs[i] = inputs[i].astype(input_dtype) + else: + val_inputs[i] = inputs[i] for i, input_shape in enumerate(expected_input_shapes): if isinstance(input_shape, str): input_shape = ast.literal_eval(input_shape) elif not input_shape: continue - if tuple(inputs[i].shape) != tuple(input_shape): - raise ValueError( - f"Expected input {i} to be of shape {input_shape} for {self.module_name}['{function_name}'], got {str(tuple(inputs[i].shape))}." - ) + actual = tuple(val_inputs[i].shape) + expected = tuple(input_shape) + for idx, shape in enumerate(expected): + if shape == "?": + pass + elif actual[idx] != shape: + raise ValueError( + f"Expected input {i} to be of shape {input_shape} for {self.module_name}['{function_name}'], got {str(tuple(inputs[i].shape))}." + ) else: - logging.warning( - f"No metadata found for {self.module_name}['{function_name}']." - ) for idx, i in enumerate(inputs): if not isinstance(i, ireert.DeviceArray): - inputs[idx] = ireert.asdevicearray(self.device, i) + val_inputs[idx] = ireert.asdevicearray(self.device, i) + else: + val_inputs[idx] = inputs[idx] + return val_inputs def _output_cast(self, output): if isinstance(output, tuple): @@ -226,9 +227,9 @@ def _run_and_benchmark(self, function_name, inputs: list): def __call__(self, function_name, inputs: list): casted_output = False - self._validate_or_convert_inputs(function_name, inputs) if not isinstance(inputs, list): inputs = [inputs] + inputs = self._validate_or_convert_inputs(function_name, inputs) if self.benchmark: output = self._run_and_benchmark(function_name, inputs) else: @@ -540,7 +541,7 @@ def export_submodel( os.makedirs(self.pipeline_dir) if ( - self.map[submodel]["external_weights"] + self.map[submodel].get("external_weights") and self.external_weights_dir and not self.map[submodel].get("weights") ): @@ -559,9 +560,6 @@ def export_submodel( ]["weights"] elif not self.map[submodel].get("external_weights"): - print( - "No external weights type specified using --external_weights, weights for imported .mlir files will not be externalized." - ) self.map[submodel]["weights"] = None if weights_only: diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py index 08d9c1621..d3e4ecb54 100644 --- a/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py +++ b/models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py @@ -159,7 +159,7 @@ def export_text_encoders( ".safetensors" not in external_weight_path ), "Original parameters format incompatible with IREE safetensors parser. Use '.irpa' instead." 
- input_args = [torch.empty([1, 77, 2], dtype=torch.int64) for x in range(6)] + input_args = [torch.empty([batch_size, 77, 2], dtype=torch.int64) for x in range(6)] decomp_list = [] if decomp_attn == True: @@ -196,7 +196,7 @@ class CompiledTextEncoder(CompiledModule): model_metadata_forward = { "model_name": "sd3_clip_t5xxl_text_encoders", - "input_shapes": [(1, max_length, 2) for x in range(6)], + "input_shapes": [(batch_size, max_length, 2) for x in range(6)], "input_dtypes": ["int64" for x in range(6)], "output_shapes": [ (2 * batch_size, max_length * 2, 4096), diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 1a8cd8858..c638e05fa 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -10,6 +10,7 @@ import torch from shark_turbine.aot import * import shark_turbine.ops.iree as ops +from shark_turbine.transforms.general.add_metadata import AddMetadataPass from iree.compiler.ir import Context import iree.runtime as ireert import numpy as np @@ -51,7 +52,7 @@ def scale_model_input(self, sample, t, timesteps): sample, t, timesteps ) - def step(self, noise_pred, t, sample, guidance_scale, step_index): + def step(self, noise_pred, t, sample, step_index): return self.runner.ctx.modules.compiled_scheduler["run_step"]( noise_pred, t, sample, guidance_scale, step_index ) @@ -78,7 +79,9 @@ def __init__( if "stable-diffusion-3" in hf_model_name: self.is_sd3 = True self.batch_size = batch_size + # Whether this will be used with CFG-enabled pipeline. self.do_classifier_free_guidance = True + self.model.set_timesteps(num_inference_steps) self.timesteps = self.model.timesteps self.model.is_scale_input_called = True @@ -107,24 +110,17 @@ def initialize(self, sample): timesteps.type(torch.float32), ) - def prepare_model_input(self, sample, t, timesteps): - t = timesteps[t] - if self.do_classifier_free_guidance: - latent_model_input = torch.cat([sample] * 2) - else: - latent_model_input = sample + def prepare_model_input(self, sample, i, timesteps): + t = timesteps[i] + + latent_model_input = sample return self.model.scale_model_input(latent_model_input, t).type( self.dtype ), t.type(self.dtype) - def step(self, noise_pred, t, sample, guidance_scale, i): - self.model._step_index = i + def step(self, noise_pred, t, sample): + self.model._step_index = self.model.index_for_timestep(t) - if self.do_classifier_free_guidance: - noise_preds = noise_pred.chunk(2) - noise_pred = noise_preds[0] + guidance_scale * ( - noise_preds[1] - noise_preds[0] - ) sample = self.model.step(noise_pred, t, sample, return_dict=False)[0] return sample.type(self.dtype) @@ -244,6 +240,7 @@ def export_scheduler_model( upload_ir=False, ): dtype = torch.float16 if precision == "fp16" else torch.float32 + iree_dtype = "float16" if precision == "fp16" else "float32" scheduler = get_scheduler(hf_model_name, scheduler_id) scheduler_module = SchedulingModel( hf_model_name, scheduler, height, width, batch_size, num_inference_steps, dtype @@ -273,12 +270,6 @@ def export_scheduler_model( ) return vmfb_path - do_classifier_free_guidance = True - if do_classifier_free_guidance: - init_batch_dim = 2 - else: - init_batch_dim = 1 - sample = ( batch_size, 4, @@ -286,7 +277,7 @@ def export_scheduler_model( width // 8, ) noise_pred_shape = ( - batch_size * init_batch_dim, + batch_size, 4, height // 8, width // 8, @@ -307,8 +298,6 @@ def export_scheduler_model( 
torch.empty(noise_pred_shape, dtype=dtype), torch.empty(1, dtype=dtype), torch.empty(sample, dtype=dtype), - torch.empty(1, dtype=dtype), - torch.empty(1, dtype=torch.int64), ] fxb = FxProgramsBuilder(scheduler_module) @@ -353,8 +342,29 @@ class CompiledScheduler(CompiledModule): import_to = "INPUT" if compile_to == "linalg" else "IMPORT" inst = CompiledScheduler(context=Context(), import_to=import_to) - module_str = str(CompiledModule.get_mlir_module(inst)) - + module = CompiledModule.get_mlir_module(inst) + metadata_modelname = "_".join( + [hf_model_name, scheduler_id, "scheduler", str(num_inference_steps)] + ) + model_metadata_init = { + "model_name": metadata_modelname, + "input_shapes": [sample], + "input_dtypes": [iree_dtype], + } + model_metadata_prep = { + "model_name": metadata_modelname, + "input_shapes": [sample, (1,), ("?",)], + "input_dtypes": [iree_dtype, "int64", "float32"], + } + model_metadata_step = { + "model_name": metadata_modelname, + "input_shapes": [noise_pred_shape, (1,), sample], + "input_dtypes": [iree_dtype, iree_dtype, iree_dtype], + } + module = AddMetadataPass(module, model_metadata_init, "run_initialize").run() + module = AddMetadataPass(module, model_metadata_prep, "run_scale").run() + module = AddMetadataPass(module, model_metadata_step, "run_step").run() + module_str = str(module) if compile_to != "vmfb": return module_str elif compile_to == "vmfb": @@ -366,8 +376,6 @@ class CompiledScheduler(CompiledModule): safe_name, return_path=True, ) - if exit_on_vmfb: - exit() return vmfb diff --git a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py index 8c68ad06c..aa5fa4a15 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py +++ b/models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py @@ -139,7 +139,7 @@ def is_valid_file(arg): p.add_argument( "--cpu_scheduling", - default=True, + default=False, action="store_true", help="Run scheduling on native pytorch CPU backend.", ) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 65c23fecd..9cbacc812 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -292,7 +292,6 @@ def __init__( self.model_max_length = max_length self.height = height self.width = width - self.latents_dtype = torch_dtypes[self.map["unet"]["precision"]] self.cpu_scheduling = cpu_scheduling self.scheduler_id = scheduler_id self.num_inference_steps = num_inference_steps @@ -327,11 +326,21 @@ def __init__( self.base_model_name, subfolder="tokenizer_2" ), ] + self.latents_precision = self.map["unet"]["precision"] + self.scheduler_device = self.map["unet"]["device"] + self.scheduler_driver = self.map["unet"]["driver"] + self.scheduler_target = self.map["unet"]["target"] elif not self.is_sd3: self.tokenizer = CLIPTokenizer.from_pretrained( self.base_model_name, subfolder="tokenizer" ) + self.latents_precision = self.map["unet"]["precision"] + self.scheduler_device = self.map["unet"]["device"] + self.scheduler_driver = self.map["unet"]["driver"] + self.scheduler_target = self.map["unet"]["target"] + # TODO: Add SD3 init + self.latents_dtype = torch_dtypes[self.latents_precision] self.use_i8_punet = self.use_punet = use_i8_punet if self.use_i8_punet: self.map["unet"]["export_args"]["precision"] = "i8" @@ -358,25 +367,52 @@ def load_scheduler( scheduler_id: 
str, steps: int = 30, ): - self.scheduler = schedulers.get_scheduler( - self.base_model_name, self.scheduler_id - ) if self.is_sd3: scheduler_device = self.mmdit.device else: scheduler_device = self.unet.device if not self.cpu_scheduling: + self.map["scheduler"] = { + "module_name": "compiled_scheduler", + "export_fn": schedulers.export_scheduler_model, + "driver": self.scheduler_driver, + "export_args": { + "hf_model_name": self.base_model_name, + "scheduler_id": scheduler_id, + "batch_size": self.batch_size, + "height": self.height, + "width": self.width, + "num_inference_steps": steps, + "precision": self.latents_precision, + "compile_to": "vmfb", + "device": self.scheduler_device, + "target": self.scheduler_target, + "pipeline_dir": self.pipeline_dir, + }, + } self.scheduler = None self.num_inference_steps = steps self.scheduler_id = scheduler_id - scheduler_path = f"{scheduler_id}Scheduler_{self.num_inference_steps}" + scheduler_uid = "_".join( + [ + f"{scheduler_id}Scheduler", + f"bs{self.batch_size}", + "x".join([str(self.width), str(self.height)]), + self.latents_precision, + str(self.num_inference_steps), + self.scheduler_target, + ] + ) + scheduler_path = os.path.join( + self.pipeline_dir, + utils.create_safe_name(self.base_model_name, scheduler_uid), + ) if not os.path.exists(scheduler_path): - scheduler_path, _ = self.export_submodel("scheduler") + self.export_submodel("scheduler") + else: + self.map["scheduler"]["vmfb"] = scheduler_path try: - self.scheduler = schedulers.SharkSchedulerWrapper( - scheduler_device, - scheduler_path, - ) + self.load_submodel("scheduler") except: print("JIT export of scheduler failed. Loading CPU scheduler.") self.cpu_scheduling = True @@ -433,11 +469,15 @@ def prepare_latents( ): if self.is_img2img: raise NotImplementedError("Image-to-image not supported yet.") - elif self.is_sdxl: + elif self.is_sdxl and self.cpu_scheduling: + self.scheduler.do_guidance = False + self.scheduler.repeat_sample = False sample, add_time_ids, step_indexes, timesteps = ( self.scheduler.initialize_sdxl(noise, num_inference_steps) ) return sample, add_time_ids, step_indexes, timesteps + elif self.is_sdxl: + return self.scheduler("run_initialize", noise) elif self.is_sd3: raise NotImplementedError("Stable Diffusion 3 not supported yet.") else: @@ -511,35 +551,33 @@ def _produce_latents_sdxl( latents, add_time_ids, step_indexes, timesteps = self.prepare_latents( sample, self.num_inference_steps, image, strength ) - self.scheduler.do_guidance = False - self.scheduler.repeat_sample = False + guidance_scale = ireert.asdevicearray( + self.unet.device, + [guidance_scale], + dtype=self.map["unet"]["np_dtype"], + ) for i, t in tqdm(enumerate(timesteps)): if self.cpu_scheduling: - step_index = i + latent_model_input, t = self.scheduler.scale_model_input( + latents, + t, + ) + t = t.type(self.map["unet"]["torch_dtype"]) else: - step_index = torch.tensor([i]) - latent_model_input, t = self.scheduler.scale_model_input( - latents, - t, - ) + step = torch.tensor([i], dtype=torch.float32) + latent_model_input, t = self.scheduler( + "run_scale", [latents, step, timesteps] + ) + unet_inputs = [ latent_model_input, t, prompt_embeds, add_text_embeds, add_time_ids, - ireert.asdevicearray( - self.unet.device, - [guidance_scale], - dtype=self.map["unet"]["np_dtype"], - ), + guidance_scale, ] if self.use_punet: - unet_inputs[1] = ireert.asdevicearray( - self.unet.device, - t, - dtype=self.map["unet"]["np_dtype"], - ) for inp_idx, inp in enumerate(unet_inputs): if not isinstance(inp, 
ireert.DeviceArray): unet_inputs[inp_idx] = ireert.asdevicearray( @@ -549,11 +587,14 @@ def _produce_latents_sdxl( self.map["unet"]["function_name"], unet_inputs, ) - latents = self.scheduler.step( - noise_pred, - t, - latents, - ) + if self.cpu_scheduling: + latents = self.scheduler.step( + noise_pred, + t, + latents, + ) + else: + latents = self.scheduler("run_step", [noise_pred, t, latents]) return latents def generate_images( diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 57ade2fb6..b9afc6de8 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -329,6 +329,11 @@ def compile_to_vmfb( def create_safe_name(hf_model_name, model_name_str=""): + if not model_name_str: + model_name_str = "" + if model_name_str != "" and (not model_name_str.startswith("_")): + model_name_str = "_" + model_name_str + safe_name = hf_model_name.split("/")[-1].strip() + model_name_str safe_name = re.sub("-", "_", safe_name) safe_name = re.sub("\.", "_", safe_name) diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py index 00b02d028..7c34b0c29 100644 --- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py +++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py @@ -269,7 +269,7 @@ def encode_prompts_turbo( model_metadata_encode = { "model_name": hf_model_name + "_text_encoder", - "input_shapes": [str((1, max_length)) for i in range(4)], + "input_shapes": [str((batch_size, max_length)) for i in range(4)], "input_dtypes": ["int64" for i in range(4)], "use_attention_mask": False, } From f922d05170aa3c28d274df60c4281d4522836643 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 22:33:35 -0500 Subject: [PATCH 169/174] Use amd-shark/sdxl-quant-models for F16 VAE weights --- .../custom_models/sd_inference/vae.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index bbcf475e8..5ec326505 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -17,6 +17,8 @@ from turbine_models.custom_models.sd_inference import utils import torch import torch._dynamo as dynamo +from huggingface_hub import hf_hub_download +from safetensors import safe_open from diffusers import AutoencoderKL import argparse from turbine_models.turbine_tank import turbine_tank @@ -36,17 +38,19 @@ def __init__( subfolder="vae", ) elif not isinstance(custom_vae, dict): - try: - # custom HF repo with no vae subfolder - self.vae = AutoencoderKL.from_pretrained( - custom_vae, - ) - except: - # some larger repo with vae subfolder - self.vae = AutoencoderKL.from_pretrained( - custom_vae, - subfolder="vae", - ) + self.vae = AutoencoderKL.from_pretrained( + hf_model_name, + subfolder="vae", + ) + fp16_weights = hf_hub_download( + repo_id=custom_vae, + filename="vae/vae.safetensors", + ) + with safe_open(fp16_weights, framework="pt", device="cpu") as f: + state_dict = {} + for key in f.keys(): + state_dict[key] = f.get_tensor(key) + self.vae.load_state_dict(state_dict) else: # custom vae as a HF state dict self.vae = AutoencoderKL.from_pretrained( @@ -143,7 +147,7 @@ def export_vae_model( vae_model = 
SD3VaeModel(hf_model_name) else: if "xl" in hf_model_name.lower() and precision == "fp16": - custom_vae = "madebyollin/sdxl-vae-fp16-fix" + custom_vae = "amd-shark/sdxl-quant-models" else: custom_vae = None vae_model = VaeModel(hf_model_name, custom_vae=custom_vae) From ec4bc05d3175cd49d0c52460c2a16a391ac1b1c8 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 22:41:14 -0500 Subject: [PATCH 170/174] small fixes to ensure specs are used where needed --- .../custom_models/sd_inference/sd_pipeline.py | 9 ++++++--- models/turbine_models/custom_models/sd_inference/vae.py | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 9cbacc812..8af083527 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -258,9 +258,12 @@ def __init__( sd_model_map[submodel]["export_args"]["height"] = height sd_model_map[submodel]["export_args"]["width"] = width if "decomp_attn" in sd_model_map[submodel]["export_args"]: - sd_model_map[submodel]["export_args"]["decomp_attn"] = decomp_attn[ - submodel - ] + if isinstance(decomp_attn, bool): + sd_model_map[submodel]["export_args"]["decomp_attn"] = decomp_attn + else: + sd_model_map[submodel]["export_args"]["decomp_attn"] = ( + decomp_attn.get[submodel, False] + ) super().__init__( sd_model_map, device, diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 5ec326505..7ccd12c48 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -127,6 +127,12 @@ def export_vae_model( ) if decomp_attn: safe_name += "_decomp_attn" + elif not attn_spec: + if "gfx9" in target: + attn_spec = "mfma" + elif "gfx11" in target: + attn_spec = "wmma" + if pipeline_dir: safe_name = os.path.join(pipeline_dir, safe_name) From 931b70c0e2e3455e2f0be580f4679c6b38caee63 Mon Sep 17 00:00:00 2001 From: Ean Garvey Date: Thu, 11 Jul 2024 22:56:05 -0500 Subject: [PATCH 171/174] fix typo --- models/turbine_models/custom_models/sd_inference/sd_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py index 8af083527..e1d3ae940 100644 --- a/models/turbine_models/custom_models/sd_inference/sd_pipeline.py +++ b/models/turbine_models/custom_models/sd_inference/sd_pipeline.py @@ -262,7 +262,7 @@ def __init__( sd_model_map[submodel]["export_args"]["decomp_attn"] = decomp_attn else: sd_model_map[submodel]["export_args"]["decomp_attn"] = ( - decomp_attn.get[submodel, False] + decomp_attn.get(submodel, False) ) super().__init__( sd_model_map, From 7e580c9257e74fedd4c67c1609af88d467fce1da Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Thu, 11 Jul 2024 23:33:32 -0500 Subject: [PATCH 172/174] Update models/requirements.txt to install editable sharktank package Signed-off-by: aviator19941 --- models/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/requirements.txt b/models/requirements.txt index b7b7d8d2b..0aed40159 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -13,4 +13,4 @@ einops pytest scipy shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main -sharktank @ 
git+https://github.com/nod-ai/sharktank@main#subdirectory=sharktank
+-e git+https://github.com/nod-ai/sharktank.git@main#egg=sharktank&subdirectory=sharktank

From b58d16ad5b903f97d57250029f79b3a26b217068 Mon Sep 17 00:00:00 2001
From: Ean Garvey
Date: Fri, 12 Jul 2024 00:42:32 -0500
Subject: [PATCH 173/174] Fix guidance conditional on cpu scheduler init.

---
 models/turbine_models/custom_models/sd_inference/schedulers.py | 3 ++-
 models/turbine_models/tests/sdxl_test.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py
index c638e05fa..0a6e36cc1 100644
--- a/models/turbine_models/custom_models/sd_inference/schedulers.py
+++ b/models/turbine_models/custom_models/sd_inference/schedulers.py
@@ -139,6 +139,7 @@ def __init__(
         self.dest = dest_device
         self.batch_size = batch_size
         self.timesteps = None
+        self.do_classifier_free_guidance = True
         self.do_guidance = True
         self.repeat_sample = True

@@ -166,7 +167,7 @@ def initialize_sdxl(self, sample, num_inference_steps):
         crops_coords_top_left = (0, 0)
         add_time_ids = list(original_size + crops_coords_top_left + target_size)
         add_time_ids = torch.tensor([add_time_ids], dtype=self.torch_dtype)
-        if self.do_guidance:
+        if self.do_classifier_free_guidance:
             add_time_ids = torch.cat([add_time_ids] * 2, dim=0)
         add_time_ids = add_time_ids.repeat(self.batch_size, 1).type(
             self.torch_dtype
diff --git a/models/turbine_models/tests/sdxl_test.py b/models/turbine_models/tests/sdxl_test.py
index da9dfdafe..216b6ff59 100644
--- a/models/turbine_models/tests/sdxl_test.py
+++ b/models/turbine_models/tests/sdxl_test.py
@@ -7,6 +7,7 @@
 import logging
 import pytest
 import torch
+import shutil
 from transformers import CLIPTokenizer
 from turbine_models.custom_models.sd_inference.utils import create_safe_name
 from turbine_models.custom_models.sd_inference import schedulers, vae

From c035f61ea8f93ea5e9ea7dc45ac905a71088938a Mon Sep 17 00:00:00 2001
From: Ean Garvey
Date: Fri, 12 Jul 2024 02:21:10 -0500
Subject: [PATCH 174/174] Use the correct variable in input metadata for prompt encoder with bs>1

---
 .../custom_models/sdxl_inference/sdxl_prompt_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py
index 7c34b0c29..d579c3419 100644
--- a/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py
+++ b/models/turbine_models/custom_models/sdxl_inference/sdxl_prompt_encoder.py
@@ -269,7 +269,7 @@ def encode_prompts_turbo(
 
     model_metadata_encode = {
         "model_name": hf_model_name + "_text_encoder",
-        "input_shapes": [str((batch_size, max_length)) for i in range(4)],
+        "input_shapes": [str((input_batchsize, max_length)) for i in range(4)],
         "input_dtypes": ["int64" for i in range(4)],
         "use_attention_mask": False,
     }
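Editorial sketch of the metadata contract the last few patches adjust: each exported function records "input_shapes"/"input_dtypes", and the runtime wrapper validates caller inputs against them, treating "?" as a wildcard dimension and casting to the recorded dtype. This is a minimal standalone illustration, not the turbine_models implementation; the function name validate_and_convert and the example metadata values are assumptions for demonstration only, and it uses plain numpy instead of IREE device arrays.

# Minimal sketch, assuming numpy only; not the turbine_models API.
import ast
import numpy as np


def validate_and_convert(metadata: dict, inputs: list) -> list:
    shapes = metadata.get("input_shapes") or []
    dtypes = metadata.get("input_dtypes") or []
    converted = []
    for i, arr in enumerate(inputs):
        arr = np.asarray(arr)
        # Cast to the recorded dtype when one is present.
        if i < len(dtypes) and arr.dtype != np.dtype(dtypes[i]):
            arr = arr.astype(dtypes[i])
        # Shapes may be stored as strings like "(2, 64)"; parse before comparing,
        # and skip any dimension recorded as the wildcard "?".
        if i < len(shapes) and shapes[i]:
            expected = shapes[i]
            if isinstance(expected, str):
                expected = ast.literal_eval(expected)
            if len(arr.shape) != len(expected):
                raise ValueError(
                    f"input {i}: expected rank {len(expected)}, got {len(arr.shape)}"
                )
            for dim, size in zip(arr.shape, expected):
                if size != "?" and dim != size:
                    raise ValueError(
                        f"input {i}: expected shape {tuple(expected)}, got {arr.shape}"
                    )
        converted.append(arr)
    return converted


# Example: a prompt-encoder-style entry recorded for batch size 2, max length 64.
meta = {
    "input_shapes": [str((2, 64)) for _ in range(4)],
    "input_dtypes": ["int64" for _ in range(4)],
}
ids = [np.zeros((2, 64), dtype=np.int32) for _ in range(4)]
outs = validate_and_convert(meta, ids)
assert all(o.dtype == np.int64 for o in outs)

With metadata recorded this way, a mismatch between the exported batch size and the shape strings (the issue PATCH 174 addresses) surfaces as an immediate shape error at call time rather than a failure inside the compiled module.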