From 38118dc1736f944f31121296a0681e56df05468a Mon Sep 17 00:00:00 2001
From: Vivian <vivian.zhang@amd.com>
Date: Thu, 31 Oct 2024 02:53:39 -0700
Subject: [PATCH] Bump IREE to 3cf5b65f736ce50c9890190b80e6343c0b929d56 (#863)

Temporarily removed two AIR pad-pack ci tests. @erwei-xilinx will look
into the failure once this patch merged in.
---
 .github/workflows/ci-linux.yml                |   2 +-
 .github/workflows/ci-macos.yml                |   2 +-
 .github/workflows/ci-windows.yml              |   2 +-
 build_tools/ci/cpu_comparison/run.py          |   4 +-
 build_tools/ci/run_matmul_test.sh             |  21 +-
 .../Test/transform_dialect/CMakeLists.txt     |   1 -
 .../transform_dialect/conv_fill_spec_pad.mlir |   4 +-
 .../matmul_fill_spec_pack_funcIR.mlir         | 199 ------------------
 .../matmul_fill_spec_pad.mlir                 |   6 +-
 .../Transforms/AMDAIEConvertToDma.cpp         |  17 +-
 .../driver/xrt-lite/nop_semaphore.cc          |   2 +-
 .../iree-amd-aie/driver/xrt/nop_semaphore.cc  |   6 +-
 third_party/iree                              |   2 +-
 13 files changed, 36 insertions(+), 232 deletions(-)
 delete mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pack_funcIR.mlir

diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 0077cf5ab..9ce0ff493 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -57,7 +57,7 @@ jobs:
       - name: Python deps
         run: |
           pip install -r third_party/iree/runtime/bindings/python/iree/runtime/build_requirements.txt
-          pip install pyyaml
+          pip install pyyaml pybind11==2.13.6 nanobind==2.2.0
 
       - name: Enable cache
         uses: actions/cache/restore@v3
diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml
index 4871e9745..bcb1a9d7a 100644
--- a/.github/workflows/ci-macos.yml
+++ b/.github/workflows/ci-macos.yml
@@ -78,7 +78,7 @@ jobs:
       - name: Python deps
         run: |
           pip install -r third_party/iree/runtime/bindings/python/iree/runtime/build_requirements.txt
-          pip install pytest
+          pip install pytest pybind11==2.13.6 nanobind==2.2.0
 
       - name: Enable cache
         uses: actions/cache/restore@v3
diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml
index 86b5f4f8a..063207112 100644
--- a/.github/workflows/ci-windows.yml
+++ b/.github/workflows/ci-windows.yml
@@ -81,7 +81,7 @@ jobs:
       - name: Python deps
         run: |
           pip install -r third_party\iree\runtime\bindings\python\iree\runtime\build_requirements.txt
-          pip install pyyaml
+          pip install pyyaml pybind11==2.13.6 nanobind==2.2.0
 
       - name: Enable cache
         uses: actions/cache/restore@v3
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 96f060d4c..9c7e0f1fb 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -537,8 +537,8 @@ def aie_vs_llvm_cpu(
     config,
     test_file,
     use_ukernel=False,
-    tile_pipeline="pad-pack",
-    lower_to_aie_pipeline="air",
+    tile_pipeline="pack-peel",
+    lower_to_aie_pipeline="objectFifo",
     function_name=None,
     seed=1,
     rtol=1e-6,
diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh
index a09870729..bda8b753e 100755
--- a/build_tools/ci/run_matmul_test.sh
+++ b/build_tools/ci/run_matmul_test.sh
@@ -555,16 +555,17 @@ run_matmul_test \
 # MLIR-AIR Matmul tests
 ###################################################################
 
-if [ -d "$VITIS" ]; then
-  run_matmul_test \
-      --name_prefix "ukern" \
-      --lower_to_aie_pipeline "air" \
-      --tile_pipeline "pad-pack" \
-      --lhs_rhs_type "bf16" \
-      --acc_type "f32" \
-      --m "256"  --k "256" --n "256" \
-      --use_ukernel "1"
-fi
+# TODO: re-enable after fixing in AIR
+# if [ -d "$VITIS" ]; then
+#   run_matmul_test \
+#       --name_prefix "ukern" \
+#       --lower_to_aie_pipeline "air" \
+#       --tile_pipeline "pad-pack" \
+#       --lhs_rhs_type "bf16" \
+#       --acc_type "f32" \
+#       --m "256"  --k "256" --n "256" \
+#       --use_ukernel "1"
+# fi
 
 # Example of a run with a group of 2+ matmuls. Currently this test is passed
 # the flag '--num_repeat_runs 0" as there is currently an issue with the runtime if
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/CMakeLists.txt
index 7cc3b7edd..db8e4a397 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/CMakeLists.txt
@@ -9,7 +9,6 @@ iree_lit_test_suite(
     lit
   SRCS
     "conv_fill_spec_pad.mlir"
-    "matmul_fill_spec_pack_funcIR.mlir"
     "matmul_fill_spec_pack_peel.mlir"
     "matmul_fill_spec_pad.mlir"
     "matmul_fill_spec_pad_pack.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/conv_fill_spec_pad.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/conv_fill_spec_pad.mlir
index 38836cc9f..0876c9816 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/conv_fill_spec_pad.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/conv_fill_spec_pad.mlir
@@ -129,7 +129,7 @@ module attributes { transform.with_named_sequence } {
     %padded_1, %pad_1, %___ = transform.structured.pad %tiled_conv_1 {
       padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
       padding_dimensions=[0, 1, 2],
-      pack_paddings=[0, 0, 1],
+      nofold_flags=[0, 0, 1],
       copy_back_op="linalg.copy"
     } : (!any) -> (!any, !any, !any)
 
@@ -163,7 +163,7 @@ module attributes { transform.with_named_sequence } {
     %padded_2, %pad_2, %____ = transform.structured.pad %inner_conv {
       padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
       padding_dimensions=[0, 1, 2],
-      pack_paddings=[1, 1, 0],
+      nofold_flags=[1, 1, 0],
       copy_back_op="linalg.copy"
     } : (!any) -> (!any, !any, !any)
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pack_funcIR.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pack_funcIR.mlir
deleted file mode 100644
index 1ebf0232b..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pack_funcIR.mlir
+++ /dev/null
@@ -1,199 +0,0 @@
-// RUN: iree-opt --iree-transform-dialect-interpreter %s | FileCheck %s
-// This script shows an example lowering matmul through pack based pipeline for AIE device.
-// This script is a prototype for funcIR proposal.
-// In this strategy, we use pack operations for data movement from L3 to L2, and L2 to L1.
-// In order to keep initialization in L1, the first iteration of scf.for loop is peeled.
-
-
-#pipeline_layout = #hal.pipeline.layout<bindings= [
-    #hal.pipeline.binding<storage_buffer, ReadOnly>,
-    #hal.pipeline.binding<storage_buffer, ReadOnly>,
-    #hal.pipeline.binding<storage_buffer>
-]>
-
-func.func @matmul_example() {
-  %c0_i32 = arith.constant 0: i32
-  %c0 = arith.constant 0 : index
-  %arg0_binding = hal.interface.binding.subspan layout(#pipeline_layout)  binding(0) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x256xi8>>
-  %arg0 = flow.dispatch.tensor.load %arg0_binding, offsets = [0, 0], sizes = [16, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x256xi8>> -> tensor<16x256xi8>
-  %arg1_binding = hal.interface.binding.subspan layout(#pipeline_layout)  binding(1) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
-  %arg1 = flow.dispatch.tensor.load %arg1_binding, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
-  %arg2_binding = hal.interface.binding.subspan layout(#pipeline_layout)  binding(2) offset(%c0) flags(None) : !flow.dispatch.tensor<writeonly:tensor<16x256xi32>>
-  %empty = tensor.empty() : tensor<16x256xi32>
-  %0 = linalg.fill ins(%c0_i32 : i32) outs(%empty : tensor<16x256xi32>) -> tensor<16x256xi32>
-  %1 = linalg.matmul ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>)
-      outs(%0 : tensor<16x256xi32>) -> tensor<16x256xi32>
-  flow.dispatch.tensor.store %1, %arg2_binding, offsets = [0, 0], sizes = [16, 256], strides = [1, 1] : tensor<16x256xi32> -> !flow.dispatch.tensor<writeonly:tensor<16x256xi32>>
-  return
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @cleanup(%variant_op: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-      transform.apply_patterns to %func {
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func : !transform.any_op
-    transform.apply_cse to %func : !transform.any_op
-    transform.yield
-  }
-
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.read_only}) {
-    %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %fill, %matmul = transform.split_handle %ops : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // First level tile to forall with tile_sizes [16, 256] to target 4 cores. Adjust to [16, 64] for 1 core.
-    %tiled_matmul, %forall =
-      transform.structured.tile_using_forall %matmul tile_sizes [16, 256]
-        ( mapping = [#gpu.block<y>, #gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Fuse fill operation into the forall loop.
-    %fused_fill, %_ = transform.structured.fuse_into_containing_op %fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Tile reduction dimension.
-    %tiled_reduction, %loop =
-      transform.structured.tile_using_for %tiled_matmul tile_sizes [0, 0, 64]
-      : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">)
-
-    // Pack by applying data tiling, and the linalg.matmul becomes linalg.generic.
-    %packed = transform.structured.pack %tiled_reduction packed_sizes = [16, 64, 64]
-      : (!transform.any_op) -> (!transform.any_op)
-
-    // Transpose B matrix from [K N n k] to [K N k n]
-    %pack_producer_b0 = transform.get_producer_of_operand %packed[1]
-      : (!transform.any_op) -> (!transform.any_op)
-    %packed_b0, %pack_b0, %empty_unpack_b0 =
-      transform.structured.pack_transpose %pack_producer_b0 with_compute_op(%packed)
-      inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-      -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Bufferize to shared memory allocation
-    %pack_producer_a0 = transform.get_producer_of_operand %packed_b0[0]
-      : (!transform.any_op) -> (!transform.any_op)
-    %pack_producer_c0 = transform.get_producer_of_operand %packed_b0[2]
-      : (!transform.any_op) -> (!transform.any_op)
-    %buffer_a0, %new_a0 = transform.structured.bufferize_to_allocation %pack_b0
-      {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op
-    %buffer_b0, %new_b0 = transform.structured.bufferize_to_allocation %pack_producer_a0
-      {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op
-    %buffer_c0, %new_c0 = transform.structured.bufferize_to_allocation %pack_producer_c0
-      {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Second level tile to forall with tile_sizes [1, 1].
-    %tiled_matmul_1, %forall_1 =
-      transform.structured.tile_using_forall %packed_b0 tile_sizes [1, 1]
-        ( mapping = [#gpu.thread<y>, #gpu.thread<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Pack by applying data tiling, and the linalg.matmul becomes linalg.generic.
-    %packed_2 = transform.structured.pack %tiled_matmul_1 packed_sizes = [0, 0, 0, 4, 8, 8]
-      : (!transform.any_op) -> (!transform.any_op)
-
-    // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0]
-    %pack_producer_a = transform.get_producer_of_operand %packed_2[0]
-      : (!transform.any_op) -> (!transform.any_op)
-    %packed_a, %pack_a, %empty_unpack_a =
-      transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed_2)
-      outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
-      -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Transpose B matrix from [K N k n n0 k0] to [K N n k k0 n0]
-    %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-      : (!transform.any_op) -> (!transform.any_op)
-    %packed_b, %pack_b, %empty_unpack_b =
-      transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-      outer_perm = [0, 1, 3, 2] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-      -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0]
-    %unpack = transform.get_consumers_of_result %packed_b[0]
-      : (!transform.any_op) -> (!transform.any_op)
-    %packed_c, %pack_c, %unpack_c =
-      transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-      outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
-      -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Bufferize to local memory allocation
-    %buffer_a, %new_a = transform.structured.bufferize_to_allocation %pack_a
-      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-    %buffer_b, %new_b = transform.structured.bufferize_to_allocation %pack_b
-      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-    %buffer_c, %new_c = transform.structured.bufferize_to_allocation %pack_c
-      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Hoist static alloc out of the loops
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    transform.iree.hoist_static_alloc %memref_func : (!transform.any_op) -> ()
-
-    // Peel the first iteration out of the for loop.
-    // This only works when the for loop has more than one iteration.
-    %1 = transform.get_parent_op %packed_c {op_name = "scf.for"} : (!transform.any_op) -> !transform.op<"scf.for">
-    %main_loop, %remainder = transform.loop.peel %1 {peel_front = true}
-      : (!transform.op<"scf.for">) -> (!transform.any_op, !transform.any_op)
-    transform.include @cleanup failures(propagate) (%variant_op) : (!transform.any_op) -> ()
-
-    // Find the fill and the second forall operations.
-    %fused_fill_1 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %fill_consumer = transform.get_consumers_of_result %fused_fill_1[0] : (!transform.any_op) -> (!transform.any_op)
-
-    // Fuse fill operation into the forall loop
-    %fused_fill_2, %__ = transform.structured.fuse_into_containing_op %fused_fill_1 into %fill_consumer
-      : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Clean up.
-    transform.include @cleanup failures(propagate) (%variant_op) : (!transform.any_op) -> ()
-
-    // Bufferize and drop HAL decriptor from memref ops.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.eliminate_empty_tensors %func_op : (!transform.any_op) -> ()
-    %memref_func_2 = transform.iree.bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: @matmul_example
-//       CHECK: memref.alloc() : memref<1x1x8x4x4x8xi32, 2>
-//       CHECK: memref.alloc() : memref<1x1x8x8x8x8xi8, 2>
-//       CHECK: memref.alloc() : memref<1x1x8x4x4x8xi8, 2>
-//       CHECK: memref.alloc() : memref<1x4x16x64xi32, 1>
-//       CHECK: memref.alloc() : memref<1x4x64x64xi8, 1>
-//       CHECK: memref.alloc() : memref<1x1x16x64xi8, 1>
-//       CHECK: scf.forall
-//       CHECK: {
-//       CHECK:   iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1>)
-//       CHECK:   iree_linalg_ext.pack %{{.*}} : (memref<64x256xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x64x64xi8, 1>)
-//       CHECK:   scf.forall
-//       CHECK:   {
-//       CHECK:     iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1> memref<1x1x8x4x4x8xi8, 2>)
-//       CHECK:     iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[16384, 4096, 64, 1], offset: ?>, 1> memref<1x1x8x8x8x8xi8, 2>)
-//       CHECK:     linalg.fill ins(%{{.*}}) outs(%{{.*}} : memref<1x1x8x4x4x8xi32, 2>)
-//       CHECK:     linalg.generic
-//       CHECK:     iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2> memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1>)
-//       CHECK:   }
-//       CHECK:   iree_linalg_ext.unpack %{{.*}} : (memref<1x4x16x64xi32, 1> memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
-//       CHECK:   scf.for
-//       CHECK:   {
-//       CHECK:     iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1>)
-//       CHECK:     iree_linalg_ext.pack %{{.*}} : (memref<64x256xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x64x64xi8, 1>)
-//       CHECK:     iree_linalg_ext.pack %{{.*}} : (memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x16x64xi32, 1>)
-//       CHECK:     scf.forall
-//       CHECK:     {
-//       CHECK:       iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1> memref<1x1x8x4x4x8xi8, 2>)
-//       CHECK:       iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[16384, 4096, 64, 1], offset: ?>, 1> memref<1x1x8x8x8x8xi8, 2>)
-//       CHECK:       iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1> memref<1x1x8x4x4x8xi32, 2>)
-//       CHECK:       linalg.generic
-//       CHECK:       iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2> memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1>)
-//       CHECK:     }
-//       CHECK:     iree_linalg_ext.unpack %{{.*}} : (memref<1x4x16x64xi32, 1> memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
-//       CHECK:   }
-//       CHECK: }
-//       CHECK: memref.dealloc %{{.*}} : memref<1x1x16x64xi8, 1>
-//       CHECK: memref.dealloc %{{.*}} : memref<1x4x64x64xi8, 1>
-//       CHECK: memref.dealloc %{{.*}} : memref<1x4x16x64xi32, 1>
-//       CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi8, 2>
-//       CHECK: memref.dealloc %{{.*}} : memref<1x1x8x8x8x8xi8, 2>
-//       CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi32, 2>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad.mlir
index 9c1be1ef0..b508c4768 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad.mlir
@@ -145,9 +145,9 @@ module attributes { transform.with_named_sequence } {
 //       CHECK: scf.forall
 //       CHECK: {
 //       CHECK:   memref.alloc() : memref<8x16xi32, 1>
-//       CHECK:   linalg.copy ins(%{{.*}} : memref<8x16xi32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%{{.*}} : memref<8x16xi32, 1>)
+//       CHECK:   linalg.copy ins(%{{.*}} : memref<8x16xi32, #hal.descriptor_type<storage_buffer>>) outs(%{{.*}} : memref<8x16xi32, 1>)
 //       CHECK:   memref.alloc() : memref<16x8xi32, 1>
-//       CHECK:   linalg.copy ins(%{{.*}} : memref<16x8xi32, strided<[8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%{{.*}} : memref<16x8xi32, 1>)
+//       CHECK:   linalg.copy ins(%{{.*}} : memref<16x8xi32, #hal.descriptor_type<storage_buffer>>) outs(%{{.*}} : memref<16x8xi32, 1>)
 //       CHECK:   memref.alloc() : memref<8x8xi32, 1>
 //       CHECK:   scf.forall
 //       CHECK:   {
@@ -166,7 +166,7 @@ module attributes { transform.with_named_sequence } {
 //       CHECK:     linalg.copy ins(%{{.*}} : memref<4x4xi32, 2>) outs(%{{.*}} : memref<4x4xi32, strided<[8, 1], offset: ?>, 1>)
 //       CHECK:     memref.dealloc %{{.*}} : memref<4x4xi32, 2>
 //       CHECK:   }
-//       CHECK:   linalg.copy ins(%{{.*}} : memref<8x8xi32, 1>) outs(%{{.*}} : memref<8x8xi32, strided<[8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
+//       CHECK:   linalg.copy ins(%{{.*}} : memref<8x8xi32, 1>) outs(%{{.*}} : memref<8x8xi32, #hal.descriptor_type<storage_buffer>>)
 //       CHECK:   memref.dealloc %{{.*}} : memref<8x16xi32, 1>
 //       CHECK:   memref.dealloc %{{.*}} : memref<16x8xi32, 1>
 //       CHECK:   memref.dealloc %{{.*}} : memref<8x8xi32, 1>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp
index 682218480..a76aa9aff 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp
@@ -160,21 +160,23 @@ LogicalResult setDmaInputs(Operation *&operandOp,
                            SmallVector<OpFoldResult> &sizes,
                            SmallVector<OpFoldResult> &strides) {
   MLIRContext *ctx = operandOp->getContext();
-  if (auto allocOp = dyn_cast<memref::AllocOp>(operandOp)) {
-    auto [stridesI64, baseOffset] = getStridesAndOffset(allocOp.getType());
+  if (isa<memref::AllocOp>(operandOp) ||
+      isa<IREE::HAL::InterfaceBindingSubspanOp>(operandOp)) {
+    MemRefType memRefType = cast<MemRefType>(operandOp->getResult(0).getType());
+    auto [stridesI64, baseOffset] = getStridesAndOffset(memRefType);
     if (baseOffset != 0) {
       auto message = llvm::formatv(
           "with non-zero base offset {0} is not supported by the "
           "current pass, requires testing and possible code changes.",
           baseOffset);
-      return allocOp->emitOpError(message);
+      return operandOp->emitOpError(message);
     }
     strides = getAsIndexOpFoldResult(ctx, stridesI64);
-    auto sizesI64 = allocOp.getType().getShape();
+    auto sizesI64 = memRefType.getShape();
     if (llvm::any_of(sizesI64, [](int64_t size) {
           return ShapedType::isDynamic(size);
         })) {
-      return allocOp->emitOpError(
+      return operandOp->emitOpError(
           "with dynamic shape is not supported by dma op.");
     }
     sizes = getAsIndexOpFoldResult(ctx, sizesI64);
@@ -235,8 +237,9 @@ LogicalResult setDmaInputs(Operation *&operandOp,
     return success();
   }
   return operandOp->emitOpError(
-      "is an unsupported operation. This pass currently only supports AllocOp "
-      "and SubViewOp as inputs.");
+      "is an unsupported operation. This pass currently only supports "
+      "hal.interface.binding.subspan, memref.alloc and memref.subview as "
+      "inputs.");
 }
 
 /// Rewrite the pack/unpack op 'op' as a DMA operation. The function arguments
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc
index aedd01453..16fb35e7e 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc
@@ -23,7 +23,7 @@ struct iree_hal_xrt_lite_semaphore {
                               iree_allocator_t host_allocator)
       : value(initial_value), host_allocator(host_allocator) {
     iree_hal_semaphore_initialize(&iree_hal_xrt_lite_semaphore_vtable, &base);
-    iree_atomic_store_int64(&value, initial_value, iree_memory_order_release);
+    iree_atomic_store(&value, initial_value, iree_memory_order_release);
   }
 };
 
diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc
index b5f44f6ab..d66578fd6 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc
@@ -40,7 +40,7 @@ iree_status_t iree_hal_xrt_semaphore_create(
     iree_hal_semaphore_initialize(&iree_hal_xrt_semaphore_vtable,
                                   &semaphore->base);
     semaphore->host_allocator = host_allocator;
-    iree_atomic_store_int64(&semaphore->value, initial_value,
+    iree_atomic_store(&semaphore->value, initial_value,
                             iree_memory_order_release);
     *out_semaphore = &semaphore->base;
   }
@@ -68,7 +68,7 @@ static iree_status_t iree_hal_xrt_semaphore_query(
       iree_hal_xrt_semaphore_cast(base_semaphore);
   // TODO: Support semaphores completely.
   *out_value =
-      iree_atomic_load_int64(&semaphore->value, iree_memory_order_acquire);
+      iree_atomic_load(&semaphore->value, iree_memory_order_acquire);
   return iree_ok_status();
 }
 
@@ -78,7 +78,7 @@ static iree_status_t iree_hal_xrt_semaphore_signal(
       iree_hal_xrt_semaphore_cast(base_semaphore);
   // TODO: Support semaphores completely. Return OK currently as everything is
   // synchronized for each submit to allow things to run.
-  iree_atomic_store_int64(&semaphore->value, new_value,
+  iree_atomic_store(&semaphore->value, new_value,
                           iree_memory_order_release);
   iree_hal_semaphore_poll(&semaphore->base);
   return iree_ok_status();
diff --git a/third_party/iree b/third_party/iree
index df5e5aab0..3cf5b65f7 160000
--- a/third_party/iree
+++ b/third_party/iree
@@ -1 +1 @@
-Subproject commit df5e5aab044ed5b6c5860b0b291c95eafe1c2522
+Subproject commit 3cf5b65f736ce50c9890190b80e6343c0b929d56