Skip to content

Commit

Permalink
Add TOSA tensor broadcast and mixed precision tests (#609)
Browse files Browse the repository at this point in the history
* Add the following TOSA integration tests to test/Integration/Dialect/TOSA/
* List of PASS tests:
i16xi16_add_elem (lane=32)
i16xi16_mul_elem (lane=32)
i16xi16_sel (lane=32)
i16xi16_sub_elem (lane=32)
i8xi8_add_elem (lane=64)
i8xi8_mul_elem (lane=32)
i8xi8_sel (lane=64)
i8xi8_sub_elem (lane=64)
bf16xbf16_sub_elem_2d_broadcast_1d (lane=16)
bf16xbf16_sub_elem_2d_broadcast_1d_reshape (lane=16)
* List of XFAIL tests:
i8xi16_sub_elem (lane=32)
bf16xbf16_sub_elem_2d_broadcast_2d (lane=16)
bf16xbf16_sub_elem_2d_broadcast_1d_unit_dim (lane=16)
  • Loading branch information
jamestcl-amd authored Aug 30, 2023
1 parent d60154a commit 481f361
Show file tree
Hide file tree
Showing 51 changed files with 1,175 additions and 15 deletions.
2 changes: 2 additions & 0 deletions test/Integration/Dialect/TOSA/bf16xbf16_add_elem/dut.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// clang-format off
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
size_t v4 = 0;
size_t v5 = 1024;
Expand All @@ -14,3 +15,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
}
return;
}
// clang-format on
2 changes: 2 additions & 0 deletions test/Integration/Dialect/TOSA/bf16xbf16_mul_elem/dut.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// clang-format off
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
size_t v4 = 16;
size_t v5 = 1024;
Expand All @@ -17,3 +18,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
}
return;
}
// clang-format on
2 changes: 2 additions & 0 deletions test/Integration/Dialect/TOSA/bf16xbf16_sel/dut.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// clang-format off
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
size_t v4 = 0;
size_t v5 = 1024;
Expand All @@ -12,3 +13,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
}
return;
}
// clang-format on
2 changes: 2 additions & 0 deletions test/Integration/Dialect/TOSA/bf16xbf16_sub_elem/dut.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// clang-format off
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
size_t v4 = 0;
size_t v5 = 1024;
Expand All @@ -14,3 +15,4 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
}
return;
}
// clang-format on
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
// Elementwise bf16 subtract with rank broadcast: the rank-1 %arg1 (1024
// elements) is subtracted from every one of the 16 rows of %arg0. The
// tosa-make-broadcastable pass (see RUN line) reconciles the differing ranks.
func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1024xbf16>) -> (tensor<16x1024xbf16>) {
%1 = "tosa.sub"(%arg0,%arg1) : (tensor<16x1024xbf16>, tensor<1024xbf16>) -> (tensor<16x1024xbf16>)
return %1 : tensor<16x1024xbf16>
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once
// Buffer sizes for the 16x1024 bf16 broadcast-subtract test: input 0 is the
// full 2-D tensor, input 1 is the 1-D vector broadcast across its 16 rows,
// and the output matches input 0.
// Note: `constexpr` already implies `const`, so the redundant qualifier is
// dropped.
constexpr unsigned IN0_SIZE = 16 * 1024;
constexpr unsigned IN1_SIZE = 1024;
constexpr unsigned OUT0_SIZE = 16 * 1024;
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// clang-format off
// Vectorized bf16 subtract with 1-D broadcast: for each of the 16 rows of v1,
// subtract the shared 1024-element vector v2, 16 lanes at a time, into v3.
void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) {
  size_t rowBegin = 0;
  size_t rowEnd = 16;
  size_t rowStep = 1;
  for (size_t row = rowBegin; row < rowEnd; row += rowStep)
  chess_prepare_for_pipelining
  chess_loop_range(16, 16)
  {
    size_t colBegin = 0;
    size_t colEnd = 1024;
    size_t colStep = 16;
    // 1024 / 16 lanes = 64 inner iterations, matching chess_loop_range(64, 64).
    for (size_t col = colBegin; col < colEnd; col += colStep)
    chess_prepare_for_pipelining
    chess_loop_range(64, 64)
    {
      // One 16-lane slice of the 2-D input and of the broadcast 1-D input.
      v16bfloat16 lhs = *(v16bfloat16 *)(v1 + 1024*row+col);
      v16bfloat16 rhs = *(v16bfloat16 *)(v2 + col);
      // Widen bf16 lanes to the float accumulator type, subtract, then
      // narrow the result back to bf16 for the store.
      v16accfloat accLhs = ups_to_v16accfloat(lhs);
      v16accfloat accRhs = ups_to_v16accfloat(rhs);
      v16accfloat diff = sub(accLhs, accRhs);
      v16bfloat16 result = to_v16bfloat16(diff);
      *(v16bfloat16 *)(v3 + 1024*row+col) = result;
    }
  }
  return;
}
// clang-format on
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
// Vectorized implementation under test (generated into dut.cc by the RUN
// pipeline in test.mlir).
void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
bfloat16 *restrict out0);
// Scalar golden-model implementation, defined at the bottom of this file.
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0);

// 32-byte-aligned global buffers — presumably aligned for the DUT's vector
// loads/stores (TODO confirm required alignment for the target).
alignas(32) bfloat16 g_in0[IN0_SIZE];
alignas(32) bfloat16 g_in1[IN1_SIZE];
alignas(32) bfloat16 g_out0[OUT0_SIZE];
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];

// Test driver: fill both inputs with reproducible random bf16 data, run the
// vectorized dut() between cycle counters, then compare its output against
// the scalar reference dut_ref() element by element.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed so the generated inputs (and hence the test) are deterministic.
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_bfloat16(-10, 10, 2); });
std::generate(g_in1, g_in1 + IN1_SIZE,
[&]() { return random_bfloat16(-10, 10, 2); });

// Dump the inputs so a failure can be reproduced offline.
writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");

// Fences bracket the timed region — presumably so memory traffic is not
// moved across the cycle-count measurement (TODO confirm semantics of
// chess_memory_fence on this target).
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_in1, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

// Scalar golden model for comparison.
dut_ref(g_in0, g_in1, g_out0Ref);
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);

// "TEST PASSED" is the string the FileCheck line in test.mlir looks for.
if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference: out0[k] = in0[k] - in1[k mod IN1_SIZE], i.e. the 1-D
// second input is broadcast across the rows of the 2-D first input.
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) {
  unsigned idx = 0;
  while (idx < OUT0_SIZE) {
    const unsigned bcast = idx % IN1_SIZE;
    out0[idx] = in0[idx] - in1[bcast];
    ++idx;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
// Same bf16 broadcast subtract, but the broadcast is made explicit: the 1-D
// %arg1 is first reshaped to rank-2 (1x1024) so tosa.sub sees matching ranks
// and broadcasts along the leading unit dimension.
func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1024xbf16>) -> (tensor<16x1024xbf16>) {
%0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1024>} : (tensor<1024xbf16>) -> (tensor<1x1024xbf16>)
%1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xbf16>, tensor<1x1024xbf16>) -> (tensor<16x1024xbf16>)
return %1 : tensor<16x1024xbf16>
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once
// Buffer sizes for the 16x1024 bf16 reshape-then-broadcast subtract test:
// input 0 is the full 2-D tensor, input 1 is the 1024-element vector that is
// reshaped to 1x1024 and broadcast over the 16 rows; output matches input 0.
// Note: `constexpr` already implies `const`, so the redundant qualifier is
// dropped.
constexpr unsigned IN0_SIZE = 16 * 1024;
constexpr unsigned IN1_SIZE = 1024;
constexpr unsigned OUT0_SIZE = 16 * 1024;
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// clang-format off
// Vectorized bf16 subtract with 1-D broadcast: for each of the 16 rows of v1,
// subtract the shared 1024-element vector v2, 16 lanes at a time, into v3.
void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) {
  size_t rowBegin = 0;
  size_t rowEnd = 16;
  size_t rowStep = 1;
  for (size_t row = rowBegin; row < rowEnd; row += rowStep)
  chess_prepare_for_pipelining
  chess_loop_range(16, 16)
  {
    size_t colBegin = 0;
    size_t colEnd = 1024;
    size_t colStep = 16;
    // 1024 / 16 lanes = 64 inner iterations, matching chess_loop_range(64, 64).
    for (size_t col = colBegin; col < colEnd; col += colStep)
    chess_prepare_for_pipelining
    chess_loop_range(64, 64)
    {
      // One 16-lane slice of the 2-D input and of the broadcast 1-D input.
      v16bfloat16 lhs = *(v16bfloat16 *)(v1 + 1024*row+col);
      v16bfloat16 rhs = *(v16bfloat16 *)(v2 + col);
      // Widen bf16 lanes to the float accumulator type, subtract, then
      // narrow the result back to bf16 for the store.
      v16accfloat accLhs = ups_to_v16accfloat(lhs);
      v16accfloat accRhs = ups_to_v16accfloat(rhs);
      v16accfloat diff = sub(accLhs, accRhs);
      v16bfloat16 result = to_v16bfloat16(diff);
      *(v16bfloat16 *)(v3 + 1024*row+col) = result;
    }
  }
  return;
}
// clang-format on
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
// Vectorized implementation under test (generated into dut.cc by the RUN
// pipeline in test.mlir).
void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
bfloat16 *restrict out0);
// Scalar golden-model implementation, defined at the bottom of this file.
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0);

// 32-byte-aligned global buffers — presumably aligned for the DUT's vector
// loads/stores (TODO confirm required alignment for the target).
alignas(32) bfloat16 g_in0[IN0_SIZE];
alignas(32) bfloat16 g_in1[IN1_SIZE];
alignas(32) bfloat16 g_out0[OUT0_SIZE];
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];

// Test driver: fill both inputs with reproducible random bf16 data, run the
// vectorized dut() between cycle counters, then compare its output against
// the scalar reference dut_ref() element by element.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed so the generated inputs (and hence the test) are deterministic.
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_bfloat16(-10, 10, 2); });
std::generate(g_in1, g_in1 + IN1_SIZE,
[&]() { return random_bfloat16(-10, 10, 2); });

// Dump the inputs so a failure can be reproduced offline.
writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");

// Fences bracket the timed region — presumably so memory traffic is not
// moved across the cycle-count measurement (TODO confirm semantics of
// chess_memory_fence on this target).
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_in1, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

// Scalar golden model for comparison.
dut_ref(g_in0, g_in1, g_out0Ref);
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);

// "TEST PASSED" is the string the FileCheck line in test.mlir looks for.
if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference: out0[k] = in0[k] - in1[k mod IN1_SIZE], i.e. the 1-D
// second input is broadcast across the rows of the 2-D first input.
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) {
  unsigned idx = 0;
  while (idx < OUT0_SIZE) {
    const unsigned bcast = idx % IN1_SIZE;
    out0[idx] = in0[idx] - in1[bcast];
    ++idx;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// XFAIL: *
// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
// Unit-dimension broadcast: the single-element %arg1 is reshaped to 1x1 and
// broadcast across the whole 16x1024 tensor (scalar subtract). This case is
// marked XFAIL in the RUN header above — the vectorization pipeline does not
// handle it yet.
func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1xbf16>) -> (tensor<16x1024xbf16>) {
%0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<1xbf16>) -> (tensor<1x1xbf16>)
%1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xbf16>, tensor<1x1xbf16>) -> (tensor<16x1024xbf16>)
return %1 : tensor<16x1024xbf16>
}
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once
// Buffer sizes for the unit-dimension broadcast subtract test: input 1 holds
// a single bf16 value that is broadcast across the whole 16x1024 input 0.
// Note: `constexpr` already implies `const`, so the redundant qualifier is
// dropped.
constexpr unsigned IN0_SIZE = 16 * 1024;
constexpr unsigned IN1_SIZE = 1;
constexpr unsigned OUT0_SIZE = 16 * 1024;
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
// Vectorized implementation under test (generated into dut.cc by the RUN
// pipeline in test.mlir).
void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
bfloat16 *restrict out0);
// Scalar golden-model implementation, defined at the bottom of this file.
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0);

// 32-byte-aligned global buffers — presumably aligned for the DUT's vector
// loads/stores (TODO confirm required alignment for the target).
alignas(32) bfloat16 g_in0[IN0_SIZE];
alignas(32) bfloat16 g_in1[IN1_SIZE];
alignas(32) bfloat16 g_out0[OUT0_SIZE];
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];

// Test driver: fill both inputs with reproducible random bf16 data, run the
// vectorized dut() between cycle counters, then compare its output against
// the scalar reference dut_ref() element by element.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed so the generated inputs (and hence the test) are deterministic.
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_bfloat16(-10, 10, 2); });
std::generate(g_in1, g_in1 + IN1_SIZE,
[&]() { return random_bfloat16(-10, 10, 2); });

// Dump the inputs so a failure can be reproduced offline.
writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");

// Fences bracket the timed region — presumably so memory traffic is not
// moved across the cycle-count measurement (TODO confirm semantics of
// chess_memory_fence on this target).
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_in1, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

// Scalar golden model for comparison.
dut_ref(g_in0, g_in1, g_out0Ref);
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);

// "TEST PASSED" is the string the FileCheck line in test.mlir looks for.
if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference: out0[k] = in0[k] - in1[k mod IN1_SIZE]. With IN1_SIZE == 1
// this broadcasts the single in1 value across every element of in0.
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) {
  unsigned idx = 0;
  while (idx < OUT0_SIZE) {
    const unsigned bcast = idx % IN1_SIZE;
    out0[idx] = in0[idx] - in1[bcast];
    ++idx;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// XFAIL: *
// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --linalg-fold-unit-extent-dims --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
// Rank-2 broadcast: %arg1 already has rank 2 (1x1024) and is broadcast along
// its leading unit dimension over the 16 rows of %arg0. This case is marked
// XFAIL in the RUN header above — note the extra --linalg-fold-unit-extent-dims
// pass in its pipeline.
func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<1x1024xbf16>) -> (tensor<16x1024xbf16>) {
%1 = "tosa.sub"(%arg0,%arg1) : (tensor<16x1024xbf16>, tensor<1x1024xbf16>) -> (tensor<16x1024xbf16>)
return %1 : tensor<16x1024xbf16>
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once
// Buffer sizes for the 2-D (1x1024) broadcast subtract test: input 0 is the
// full 16x1024 tensor, input 1 is the 1x1024 row broadcast over its 16 rows,
// and the output matches input 0.
// Note: `constexpr` already implies `const`, so the redundant qualifier is
// dropped.
constexpr unsigned IN0_SIZE = 16 * 1024;
constexpr unsigned IN1_SIZE = 1024;
constexpr unsigned OUT0_SIZE = 16 * 1024;
Loading

0 comments on commit 481f361

Please sign in to comment.