Add TOSA tensor broadcast and mixed precision tests (#609)

* Add the following TOSA integration tests to test/Integration/Dialect/TOSA/
* List of PASS tests:
  - i16xi16_add_elem (lane=32)
  - i16xi16_mul_elem (lane=32)
  - i16xi16_sel (lane=32)
  - i16xi16_sub_elem (lane=32)
  - i8xi8_add_elem (lane=64)
  - i8xi8_mul_elem (lane=32)
  - i8xi8_sel (lane=64)
  - i8xi8_sub_elem (lane=64)
  - bf16xbf16_sub_elem_2d_broadcast_1d (lane=16)
  - bf16xbf16_sub_elem_2d_broadcast_1d_reshape (lane=16)
* List of XFAIL tests:
  - i8xi16_sub_elem (lane=32)
  - bf16xbf16_sub_elem_2d_broadcast_2d (lane=16)
  - bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim (lane=16)
Xilinx · Aug 30, 2023 · 481f361 · 481f361
1 parent d60154a
commit 481f361
Show file tree

Hide file tree

Showing 51 changed files with 1,175 additions and 15 deletions.
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_add_elem/dut.cc b/test/Integration/Dialect/TOSA/bf16xbf16_add_elem/dut.cc
@@ -1,3 +1,4 @@
+// clang-format off
 void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
   size_t v4 = 0;
   size_t v5 = 1024;
@@ -14,3 +15,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
     }
   return;
 }
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_mul_elem/dut.cc b/test/Integration/Dialect/TOSA/bf16xbf16_mul_elem/dut.cc
@@ -1,3 +1,4 @@
+// clang-format off
 void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
   size_t v4 = 16;
   size_t v5 = 1024;
@@ -17,3 +18,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
     }
   return;
 }
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sel/dut.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sel/dut.cc
@@ -1,3 +1,4 @@
+// clang-format off
 void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
   size_t v4 = 0;
   size_t v5 = 1024;
@@ -12,3 +13,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
     }
   return;
 }
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem/dut.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem/dut.cc
@@ -1,3 +1,4 @@
+// clang-format off
 void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
   size_t v4 = 0;
   size_t v5 = 1024;
@@ -14,3 +15,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
     }
   return;
 }
+// clang-format on
diff --git a/...n/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/bf16xbf16_sub_elem_2d_broadcast_1d.mlir b/...n/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/bf16xbf16_sub_elem_2d_broadcast_1d.mlir
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Test input: one function, @dut, performing an element-wise bf16 subtraction.
+  func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1024xbf16>) -> (tensor<16x1024xbf16>) { // lhs is rank 2 (16x1024), rhs is rank 1 (1024)
+    %1 = "tosa.sub"(%arg0,%arg1) : (tensor<16x1024xbf16>, tensor<1024xbf16>)  -> (tensor<16x1024xbf16>) // operand ranks differ; the first mlir-opt invocation above applies tosa-make-broadcastable before lowering
+    return %1 : tensor<16x1024xbf16> // result has the shape of the larger operand
+  }
+}
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/defines.h b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 16 * 1024; // element count of the first input buffer (16 x 1024, as in the .mlir %arg0 type)
+constexpr unsigned const IN1_SIZE = 1024; // element count of the second input buffer (the 1024-element operand that is broadcast)
+constexpr unsigned const OUT0_SIZE = 16 * 1024; // element count of the output buffer; same as IN0_SIZE
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/dut.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/dut.cc
@@ -0,0 +1,28 @@
+// clang-format off
+void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) { // v3 = v1 - v2, 16 lanes at a time; presumably the output of the aie-translate step in the .mlir pipeline (-o dut.cc)
+  size_t v4 = 0; // outer loop lower bound
+  size_t v5 = 16; // outer loop upper bound
+  size_t v6 = 1; // outer loop step
+  for (size_t v7 = v4; v7 < v5; v7 += v6) // v7 selects one 1024-element row of v1 / v3
+  chess_prepare_for_pipelining // Chess compiler annotation; presumably requests software pipelining - confirm in the Chess docs
+  chess_loop_range(16, 16) // presumably a min/max trip-count hint; the loop above runs exactly 16 times
+  {
+    size_t v8 = 0; // inner loop lower bound
+    size_t v9 = 1024; // inner loop upper bound
+    size_t v10 = 16; // inner loop step: one 16-lane vector per iteration
+    for (size_t v11 = v8; v11 < v9; v11 += v10) // v11 is the element offset within the row
+    chess_prepare_for_pipelining
+    chess_loop_range(64, 64) // 1024 / 16 = 64 iterations
+    {
+      v16bfloat16 v12 = *(v16bfloat16 *)(v1 + 1024*v7+v11); // load 16 bf16 values from v1 at row v7, offset v11
+      v16bfloat16 v13 = *(v16bfloat16 *)(v2 + v11); // load from v2 with no row term: the same 1024 values are reused for every v7
+      v16accfloat v14 = ups_to_v16accfloat(v12); // convert both operands to the v16accfloat accumulator type before subtracting
+      v16accfloat v15 = ups_to_v16accfloat(v13);
+      v16accfloat v16 = sub(v14, v15); // lane-wise v14 - v15
+      v16bfloat16 v17 = to_v16bfloat16(v16); // convert the accumulator result back to bf16
+      *(v16bfloat16 *)(v3 + 1024*v7+v11) = v17; // store to v3 at the same row/offset that v1 was read from
+    }
+  }
+  return;
+}
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/testbench.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d/testbench.cc
@@ -0,0 +1,56 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
+         bfloat16 *restrict out0); // kernel under test; supplied by dut.cc, which the lit compile line builds together with this file
+void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0); // scalar reference implementation, defined at the bottom of this file
+
+alignas(32) bfloat16 g_in0[IN0_SIZE]; // first input; 32-byte aligned because dut.cc accesses it through v16bfloat16 pointer casts
+alignas(32) bfloat16 g_in1[IN1_SIZE]; // second (broadcast) input
+alignas(32) bfloat16 g_out0[OUT0_SIZE]; // output written by dut()
+alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; // output written by dut_ref(), compared against g_out0
+
+int main(int argc, char *argv[]) { // Test driver: generate inputs, time dut(), compare against dut_ref(), print the verdict. argc/argv are unused.
+  std::string dataDir(TO_STR(DATA_DIR)); // directory for the dumped data files; DATA_DIR / TO_STR are not defined in this file
+  srand(10); // fixed seed; presumably makes random_bfloat16 reproducible across runs - confirm in ../common/testbench.h
+  std::generate(g_in0, g_in0 + IN0_SIZE,
+                [&]() { return random_bfloat16(-10, 10, 2); }); // fill every element of g_in0; argument meaning is defined by the helper (presumably a [-10, 10] range)
+  std::generate(g_in1, g_in1 + IN1_SIZE,
+                [&]() { return random_bfloat16(-10, 10, 2); }); // same generator for g_in1
+
+  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); // dump both inputs for offline inspection
+  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
+
+  chess_memory_fence(); // fences bracket the timed region
+  auto cyclesBegin = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0); // the only call inside the timed region
+  auto cyclesEnd = chess_cycle_count();
+  chess_memory_fence();
+
+  auto cycleCount = (int)(cyclesEnd - cyclesBegin); // elapsed cycle count of the dut() call, narrowed to int
+  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
+
+  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); // dump the kernel output
+
+  dut_ref(g_in0, g_in1, g_out0Ref); // compute the expected output from the same inputs
+  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); // comparison rule (exact or tolerant) is defined by the checkData helper, not here
+
+  if (ok)
+    printf("TEST PASSED\n"); // the lit test's FileCheck directive matches this exact text
+  else
+    printf("TEST FAILED\n");
+
+  return ok ? 0 : 1; // exit status 0 on success, 1 on mismatch
+}
+
+void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) { // scalar reference: out0 = in0 - in1, with in1 repeated along the flattened index
+  for (unsigned k = 0; k < OUT0_SIZE; k += 1) { // k walks the flattened output
+    out0[k] = in0[k] - in1[k % IN1_SIZE]; // k % IN1_SIZE wraps every IN1_SIZE elements, so each 1024-element row reuses the same in1 values
+  }
+}
diff --git a/...f16xbf16_sub_elem_2d_broadcast_1d_reshape/bf16xbf16_sub_elem_2d_broadcast_1d_reshape.mlir b/...f16xbf16_sub_elem_2d_broadcast_1d_reshape/bf16xbf16_sub_elem_2d_broadcast_1d_reshape.mlir
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Test input: @dut subtracts a 1-D tensor from a 2-D tensor, with the rank made equal by an explicit reshape.
+  func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1024xbf16>) -> (tensor<16x1024xbf16>) { // lhs is rank 2 (16x1024), rhs is rank 1 (1024)
+    %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1024>} : (tensor<1024xbf16>)  -> (tensor<1x1024xbf16>) // give the rhs a leading unit dimension so both tosa.sub operands are rank 2
+    %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xbf16>, tensor<1x1024xbf16>)  -> (tensor<16x1024xbf16>) // 16x1024 minus 1x1024: the rhs is broadcast along the first dimension
+    return %1 : tensor<16x1024xbf16>
+  }
+}
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_reshape/defines.h b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_reshape/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 16 * 1024; // element count of the first input buffer (16 x 1024, as in the .mlir %arg0 type)
+constexpr unsigned const IN1_SIZE = 1024; // element count of the second input buffer (the 1024-element operand that is reshaped and broadcast)
+constexpr unsigned const OUT0_SIZE = 16 * 1024; // element count of the output buffer; same as IN0_SIZE
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_reshape/dut.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_reshape/dut.cc
@@ -0,0 +1,28 @@
+// clang-format off
+void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) { // v3 = v1 - v2, 16 lanes at a time; presumably the output of the aie-translate step in the .mlir pipeline (-o dut.cc)
+  size_t v4 = 0; // outer loop lower bound
+  size_t v5 = 16; // outer loop upper bound
+  size_t v6 = 1; // outer loop step
+  for (size_t v7 = v4; v7 < v5; v7 += v6) // v7 selects one 1024-element row of v1 / v3
+  chess_prepare_for_pipelining // Chess compiler annotation; presumably requests software pipelining - confirm in the Chess docs
+  chess_loop_range(16, 16) // presumably a min/max trip-count hint; the loop above runs exactly 16 times
+  {
+    size_t v8 = 0; // inner loop lower bound
+    size_t v9 = 1024; // inner loop upper bound
+    size_t v10 = 16; // inner loop step: one 16-lane vector per iteration
+    for (size_t v11 = v8; v11 < v9; v11 += v10) // v11 is the element offset within the row
+    chess_prepare_for_pipelining
+    chess_loop_range(64, 64) // 1024 / 16 = 64 iterations
+    {
+      v16bfloat16 v12 = *(v16bfloat16 *)(v1 + 1024*v7+v11); // load 16 bf16 values from v1 at row v7, offset v11
+      v16bfloat16 v13 = *(v16bfloat16 *)(v2 + v11); // load from v2 with no row term: the same 1024 values are reused for every v7
+      v16accfloat v14 = ups_to_v16accfloat(v12); // convert both operands to the v16accfloat accumulator type before subtracting
+      v16accfloat v15 = ups_to_v16accfloat(v13);
+      v16accfloat v16 = sub(v14, v15); // lane-wise v14 - v15
+      v16bfloat16 v17 = to_v16bfloat16(v16); // convert the accumulator result back to bf16
+      *(v16bfloat16 *)(v3 + 1024*v7+v11) = v17; // store to v3 at the same row/offset that v1 was read from
+    }
+  }
+  return;
+}
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_reshape/testbench.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_reshape/testbench.cc
@@ -0,0 +1,56 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
+         bfloat16 *restrict out0); // kernel under test; supplied by dut.cc, which the lit compile line builds together with this file
+void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0); // scalar reference implementation, defined at the bottom of this file
+
+alignas(32) bfloat16 g_in0[IN0_SIZE]; // first input; 32-byte aligned because dut.cc accesses it through v16bfloat16 pointer casts
+alignas(32) bfloat16 g_in1[IN1_SIZE]; // second (broadcast) input
+alignas(32) bfloat16 g_out0[OUT0_SIZE]; // output written by dut()
+alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; // output written by dut_ref(), compared against g_out0
+
+int main(int argc, char *argv[]) { // Test driver: generate inputs, time dut(), compare against dut_ref(), print the verdict. argc/argv are unused.
+  std::string dataDir(TO_STR(DATA_DIR)); // directory for the dumped data files; DATA_DIR / TO_STR are not defined in this file
+  srand(10); // fixed seed; presumably makes random_bfloat16 reproducible across runs - confirm in ../common/testbench.h
+  std::generate(g_in0, g_in0 + IN0_SIZE,
+                [&]() { return random_bfloat16(-10, 10, 2); }); // fill every element of g_in0; argument meaning is defined by the helper (presumably a [-10, 10] range)
+  std::generate(g_in1, g_in1 + IN1_SIZE,
+                [&]() { return random_bfloat16(-10, 10, 2); }); // same generator for g_in1
+
+  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); // dump both inputs for offline inspection
+  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
+
+  chess_memory_fence(); // fences bracket the timed region
+  auto cyclesBegin = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0); // the only call inside the timed region
+  auto cyclesEnd = chess_cycle_count();
+  chess_memory_fence();
+
+  auto cycleCount = (int)(cyclesEnd - cyclesBegin); // elapsed cycle count of the dut() call, narrowed to int
+  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
+
+  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); // dump the kernel output
+
+  dut_ref(g_in0, g_in1, g_out0Ref); // compute the expected output from the same inputs
+  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); // comparison rule (exact or tolerant) is defined by the checkData helper, not here
+
+  if (ok)
+    printf("TEST PASSED\n"); // the lit test's FileCheck directive matches this exact text
+  else
+    printf("TEST FAILED\n");
+
+  return ok ? 0 : 1; // exit status 0 on success, 1 on mismatch
+}
+
+void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) { // scalar reference: out0 = in0 - in1, with in1 repeated along the flattened index
+  for (unsigned k = 0; k < OUT0_SIZE; k += 1) { // k walks the flattened output
+    out0[k] = in0[k] - in1[k % IN1_SIZE]; // k % IN1_SIZE wraps every IN1_SIZE elements, so each 1024-element row reuses the same in1 values
+  }
+}
diff --git a/...6xbf16_sub_elem_2d_broadcast_1d_unit_dim/bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim.mlir b/...6xbf16_sub_elem_2d_broadcast_1d_unit_dim/bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim.mlir
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// XFAIL: *
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Test input: @dut subtracts a single-element tensor from a 2-D tensor; this test is marked as an expected failure in the header above.
+  func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1xbf16>) -> (tensor<16x1024xbf16>) { // lhs is 16x1024, rhs holds exactly one element
+    %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<1xbf16>)  -> (tensor<1x1xbf16>) // make the rhs rank 2 with both dimensions of size 1
+    %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xbf16>, tensor<1x1xbf16>)  -> (tensor<16x1024xbf16>) // 16x1024 minus 1x1: the single rhs value is broadcast along both dimensions
+    return %1 : tensor<16x1024xbf16>
+  }
+}
+
+
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim/defines.h b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 16 * 1024; // element count of the first input buffer (16 x 1024, as in the .mlir %arg0 type)
+constexpr unsigned const IN1_SIZE = 1; // the second input is a single element (tensor<1xbf16> in the .mlir)
+constexpr unsigned const OUT0_SIZE = 16 * 1024; // element count of the output buffer; same as IN0_SIZE
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim/testbench.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim/testbench.cc
@@ -0,0 +1,56 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
+         bfloat16 *restrict out0); // kernel under test; supplied by dut.cc, which the lit compile line builds together with this file
+void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0); // scalar reference implementation, defined at the bottom of this file
+
+alignas(32) bfloat16 g_in0[IN0_SIZE]; // first input; 32-byte aligned, presumably for vector loads in the generated kernel
+alignas(32) bfloat16 g_in1[IN1_SIZE]; // second input; holds IN1_SIZE element(s) - one, per defines.h
+alignas(32) bfloat16 g_out0[OUT0_SIZE]; // output written by dut()
+alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; // output written by dut_ref(), compared against g_out0
+
+int main(int argc, char *argv[]) { // Test driver: generate inputs, time dut(), compare against dut_ref(), print the verdict. argc/argv are unused.
+  std::string dataDir(TO_STR(DATA_DIR)); // directory for the dumped data files; DATA_DIR / TO_STR are not defined in this file
+  srand(10); // fixed seed; presumably makes random_bfloat16 reproducible across runs - confirm in ../common/testbench.h
+  std::generate(g_in0, g_in0 + IN0_SIZE,
+                [&]() { return random_bfloat16(-10, 10, 2); }); // fill every element of g_in0; argument meaning is defined by the helper (presumably a [-10, 10] range)
+  std::generate(g_in1, g_in1 + IN1_SIZE,
+                [&]() { return random_bfloat16(-10, 10, 2); }); // same generator for g_in1
+
+  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); // dump both inputs for offline inspection
+  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
+
+  chess_memory_fence(); // fences bracket the timed region
+  auto cyclesBegin = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0); // the only call inside the timed region
+  auto cyclesEnd = chess_cycle_count();
+  chess_memory_fence();
+
+  auto cycleCount = (int)(cyclesEnd - cyclesBegin); // elapsed cycle count of the dut() call, narrowed to int
+  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
+
+  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); // dump the kernel output
+
+  dut_ref(g_in0, g_in1, g_out0Ref); // compute the expected output from the same inputs
+  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); // comparison rule (exact or tolerant) is defined by the checkData helper, not here
+
+  if (ok)
+    printf("TEST PASSED\n"); // the lit test's FileCheck directive matches this exact text
+  else
+    printf("TEST FAILED\n");
+
+  return ok ? 0 : 1; // exit status 0 on success, 1 on mismatch
+}
+
+void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) { // scalar reference: out0 = in0 - in1, with in1 repeated along the flattened index
+  for (unsigned k = 0; k < OUT0_SIZE; k += 1) { // k walks the flattened output
+    out0[k] = in0[k] - in1[k % IN1_SIZE]; // with IN1_SIZE == 1 (defines.h) the index is always 0, so in1[0] is subtracted from every element
+  }
+}
diff --git a/...n/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_2d/bf16xbf16_sub_elem_2d_broadcast_2d.mlir b/...n/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_2d/bf16xbf16_sub_elem_2d_broadcast_2d.mlir
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// XFAIL: *
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --linalg-fold-unit-extent-dims --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Test input: @dut subtracts a 1x1024 tensor from a 16x1024 tensor; this test is marked as an expected failure in the header above.
+  func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1x1024xbf16>) -> (tensor<16x1024xbf16>) { // both operands are already rank 2; the rhs has a unit leading dimension
+    %1 = "tosa.sub"(%arg0,%arg1) : (tensor<16x1024xbf16>, tensor<1x1024xbf16>)  -> (tensor<16x1024xbf16>) // no reshape is needed; the rhs is broadcast along the first dimension
+    return %1 : tensor<16x1024xbf16>
+  }
+}
diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_2d/defines.h b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_2d/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 16 * 1024; // element count matching %arg0's tensor<16x1024xbf16> in the .mlir test
+constexpr unsigned const IN1_SIZE = 1024; // element count matching %arg1's tensor<1x1024xbf16> (1 * 1024)
+constexpr unsigned const OUT0_SIZE = 16 * 1024; // element count matching the tensor<16x1024xbf16> result