-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support of broadcast with vector width = 256 or 1024 and fix TOSA…
… tests (#653) * Add support for broadcast_elem/broadcast_to_vxx when the vector width is 256 (e.g. v16bf16) or 1024 (e.g. v32int32). * Since we lower the vector.broadcast op to multiple aievec ops, we have to fix the FoldMulAddChainToConv pass to recognize the new aievec.broadcast patterns. * Add the following list of PASS tests for implicit broadcast: i32xi32_sub_elem_16x1024_broadcast_1 i32xi32_sub_elem_2d_broadcast_1d_unit_dim_v16 (out=i32, lane=16) i32xi32_sub_elem_2d_broadcast_1d_unit_dim_v32 (out=i32, lane=32) i32xi32_sub_elem_2d_broadcast_scalar_v16 (out=i32, lane=16) i32xi32_sub_elem_2d_broadcast_scalar_v32 (out=i32, lane=32) i32xi32_sub_elem_16x1024_broadcast_1024 i32xi32_sub_elem_2d_broadcast_1d_reshape_v16 (out=i32, lane=16) i32xi32_sub_elem_2d_broadcast_1d_reshape_v32 (out=i32, lane=32) i32xi32_sub_elem_2d_broadcast_1d_v16 (out=i32, lane=16) i32xi32_sub_elem_2d_broadcast_1d_v32 (out=i32, lane=32) i32xi32_sub_elem_2d_broadcast_2d_v16 (out=i32, lane=16) i32xi32_sub_elem_2d_broadcast_2d_v32 (out=i32, lane=32) * Add a dut.cc reference for the bf16xbf16_sub_elem_16x1024_broadcast_1 tests. The resulting dut.cc is legal, but it is blocked by the "broadcast_elem() of v32bfloat16" bug. Hence, the tests are still marked XFAIL. * Add conversion test coverage for aievec.broadcast and aievec.broadcast_scalar in test_broadcast.mlir. * Fix the i8xi16_mul_elem_v32 mlir script.
- Loading branch information
1 parent
afe87cb
commit cd3f907
Showing
29 changed files
with
831 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
34 changes: 34 additions & 0 deletions
34
test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_16x1024_broadcast_1/dut.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Cycle count: 1111 | ||
// clang-format off | ||
// Generated reference kernel: for each 16-lane chunk of v1, subtracts a | ||
// vector derived from the first element(s) of v2 and stores the result at the | ||
// same offset in v3. The loops walk 16 rows of 1024 elements each. | ||
// | ||
// v1: minuend input, indexed as v1[1024*row + col]. | ||
// v2: input that is broadcast; only a single vector load at offset 0 is made. | ||
// v3: output, indexed the same way as v1. | ||
// | ||
// Per the commit description, this reference is blocked by a | ||
// "broadcast_elem() of v32bfloat16" bug and the test stays XFAIL. | ||
void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) { | ||
size_t v4 = 0; | ||
bfloat16 * restrict v5 = v2; | ||
// NOTE(review): this loads a full 16-lane vector from v2 at offset 0; | ||
// confirm the v2 buffer is large enough and aligned for that load. | ||
v16bfloat16 v6 = *(v16bfloat16 *)(v5 + v4+v4); | ||
// Widen to 32 lanes before broadcast_elem, then take a 16-lane part back. | ||
// Presumably broadcast_elem(v7, 0) replicates lane 0 across all lanes -- | ||
// verify against the intrinsic documentation. | ||
v32bfloat16 v7 = concat(v6, v6); | ||
v32bfloat16 v8 = broadcast_elem(v7, 0); | ||
v16bfloat16 v9 = extract_v16bfloat16(v8, 0); | ||
// Loop-invariant subtrahend, converted once to the accumulator type. | ||
v16accfloat v10 = ups_to_v16accfloat(v9); | ||
// Outer loop: v14 runs from 0 to 16 in steps of 1. | ||
size_t v11 = 0; | ||
size_t v12 = 16; | ||
size_t v13 = 1; | ||
for (size_t v14 = v11; v14 < v12; v14 += v13) | ||
chess_prepare_for_pipelining | ||
chess_loop_range(16, 16) | ||
{ | ||
// Inner loop: v18 runs from 0 to 1024 in steps of 16, i.e. 64 iterations, | ||
// matching the chess_loop_range(64, 64) annotation. | ||
size_t v15 = 0; | ||
size_t v16 = 1024; | ||
size_t v17 = 16; | ||
for (size_t v18 = v15; v18 < v16; v18 += v17) | ||
chess_prepare_for_pipelining | ||
chess_loop_range(64, 64) | ||
{ | ||
// Load 16 lanes, convert to accumulator type, subtract, convert back, store. | ||
v16bfloat16 v19 = *(v16bfloat16 *)(v1 + 1024*v14+v18); | ||
v16accfloat v20 = ups_to_v16accfloat(v19); | ||
v16accfloat v21 = sub(v20, v10); | ||
v16bfloat16 v22 = to_v16bfloat16(v21); | ||
*(v16bfloat16 *)(v3 + 1024*v14+v18) = v22; | ||
} | ||
} | ||
return; | ||
} | ||
// clang-format on |
4 changes: 4 additions & 0 deletions
4
test/Integration/Dialect/TOSA/i32xi32_sub_elem_16x1024_broadcast_1/defines.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#pragma once | ||
// Size constants for the i32xi32_sub_elem_16x1024_broadcast_1 tests. | ||
// Presumably element counts of the testbench buffers (the testbench is not | ||
// shown here) -- TODO confirm. | ||
// First input: 16 * 1024. | ||
constexpr unsigned const IN0_SIZE = 16 * 1024; | ||
// Second input: 1. | ||
constexpr unsigned const IN1_SIZE = 1; | ||
// Output: 16 * 1024, the same as the first input. | ||
constexpr unsigned const OUT0_SIZE = 16 * 1024; |
29 changes: 29 additions & 0 deletions
29
...i32xi32_sub_elem_16x1024_broadcast_1/i32xi32_sub_elem_2d_broadcast_1d_unit_dim_v16/dut.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
// Cycle count: 2131 | ||
// clang-format off | ||
// Generated reference kernel (16-lane int32 variant): for each 16-lane chunk | ||
// of v1, subtracts a vector derived from the first element(s) of v2 and | ||
// stores the result at the same offset in v3. The loops walk 16 rows of 1024 | ||
// elements each. | ||
// | ||
// v1: minuend input, indexed as v1[1024*row + col]. | ||
// v2: input that is broadcast; only a single vector load at offset 0 is made. | ||
// v3: output, indexed the same way as v1. | ||
void dut(int32_t * restrict v1, int32_t * restrict v2, int32_t * restrict v3) { | ||
size_t v4 = 0; | ||
int32_t * restrict v5 = v2; | ||
// NOTE(review): this loads a full 16-lane vector from v2 at offset 0, while | ||
// the defines.h in this test directory sets IN1_SIZE = 1; confirm the | ||
// testbench buffer is large enough and aligned for that load. | ||
v16int32 v6 = *(v16int32 *)(v5 + v4+v4); | ||
// Loop-invariant subtrahend. Presumably broadcast_elem(v6, 0) replicates | ||
// lane 0 across all lanes -- verify against the intrinsic documentation. | ||
v16int32 v7 = broadcast_elem(v6, 0); | ||
// Outer loop: v11 runs from 0 to 16 in steps of 1. | ||
size_t v8 = 0; | ||
size_t v9 = 16; | ||
size_t v10 = 1; | ||
for (size_t v11 = v8; v11 < v9; v11 += v10) | ||
chess_prepare_for_pipelining | ||
chess_loop_range(16, 16) | ||
{ | ||
// Inner loop: v15 runs from 0 to 1024 in steps of 16, i.e. 64 iterations, | ||
// matching the chess_loop_range(64, 64) annotation. | ||
size_t v12 = 0; | ||
size_t v13 = 1024; | ||
size_t v14 = 16; | ||
for (size_t v15 = v12; v15 < v13; v15 += v14) | ||
chess_prepare_for_pipelining | ||
chess_loop_range(64, 64) | ||
{ | ||
// Load 16 lanes from v1, subtract the broadcast vector, store to v3. | ||
v16int32 v16 = *(v16int32 *)(v1 + 1024*v11+v15); | ||
v16int32 v17 = sub(v16, v7); | ||
*(v16int32 *)(v3 + 1024*v11+v15) = v17; | ||
} | ||
} | ||
return; | ||
} | ||
// clang-format on |
23 changes: 23 additions & 0 deletions
23
..._sub_elem_2d_broadcast_1d_unit_dim_v16/i32xi32_sub_elem_2d_broadcast_1d_unit_dim_v16.mlir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// Copyright (C) 2023, Advanced Micro Devices, Inc. | ||
|
||
// REQUIRES: valid_xchess_license | ||
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor))" -o linalg.mlir | ||
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir | ||
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir | ||
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc | ||
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/../testbench.cc dut.cc | ||
// RUN: mkdir -p data | ||
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout | ||
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s | ||
// CHECK: TEST PASSED | ||
|
||
// Test input: subtracts a one-element i32 tensor from a 16x1024 i32 tensor. | ||
module { | ||
// %arg0: 16x1024 i32 minuend; %arg1: one-element i32 subtrahend. | ||
// Returns a 16x1024 i32 tensor. | ||
func.func @dut(%arg0: tensor<16x1024xi32>, %arg1: tensor<1xi32>) -> (tensor<16x1024xi32>) { | ||
// Reshape the 1-D operand to rank 2 (1x1) so both tosa.sub operands have | ||
// the same rank. | ||
%0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<1xi32>) -> (tensor<1x1xi32>) | ||
// Subtraction with a 1x1 right-hand operand and a 16x1024 result. | ||
%1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xi32>, tensor<1x1xi32>) -> (tensor<16x1024xi32>) | ||
return %1 : tensor<16x1024xi32> | ||
} | ||
} | ||
|
||
|
33 changes: 33 additions & 0 deletions
33
...i32xi32_sub_elem_16x1024_broadcast_1/i32xi32_sub_elem_2d_broadcast_1d_unit_dim_v32/dut.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
// Cycle count: 2148 | ||
// clang-format off | ||
// Generated reference kernel (32-lane int32 variant): for each 32-lane chunk | ||
// of v1, subtracts a vector derived from the first element(s) of v2 and | ||
// stores the result at the same offset in v3. The loops walk 16 rows of 1024 | ||
// elements each, and the subtraction is done on v32acc32 values. | ||
// | ||
// v1: minuend input, indexed as v1[1024*row + col]. | ||
// v2: input that is broadcast; only a single vector load at offset 0 is made. | ||
// v3: output, indexed the same way as v1. | ||
void dut(int32_t * restrict v1, int32_t * restrict v2, int32_t * restrict v3) { | ||
size_t v4 = 0; | ||
int32_t * restrict v5 = v2; | ||
// NOTE(review): this loads a full 16-lane vector from v2 at offset 0, while | ||
// the defines.h in this test directory sets IN1_SIZE = 1; confirm the | ||
// testbench buffer is large enough and aligned for that load. | ||
v16int32 v6 = *(v16int32 *)(v5 + v4+v4); | ||
// The broadcast is done on 16 lanes and then doubled to 32 lanes with | ||
// concat. Presumably broadcast_elem(v6, 0) replicates lane 0 across all | ||
// lanes -- verify against the intrinsic documentation. | ||
v16int32 v7 = broadcast_elem(v6, 0); | ||
v32int32 v8 = concat(v7, v7); | ||
// Loop-invariant subtrahend, converted once to the accumulator type. | ||
v32acc32 v9 = v32acc32(v8); | ||
// Outer loop: v13 runs from 0 to 16 in steps of 1. | ||
size_t v10 = 0; | ||
size_t v11 = 16; | ||
size_t v12 = 1; | ||
for (size_t v13 = v10; v13 < v11; v13 += v12) | ||
chess_prepare_for_pipelining | ||
chess_loop_range(16, 16) | ||
{ | ||
// Inner loop: v17 runs from 0 to 1024 in steps of 32, i.e. 32 iterations, | ||
// matching the chess_loop_range(32, 32) annotation. | ||
size_t v14 = 0; | ||
size_t v15 = 1024; | ||
size_t v16 = 32; | ||
for (size_t v17 = v14; v17 < v15; v17 += v16) | ||
chess_prepare_for_pipelining | ||
chess_loop_range(32, 32) | ||
{ | ||
// Load 32 lanes, convert to accumulator type, subtract, convert back, store. | ||
v32int32 v18 = *(v32int32 *)(v1 + 1024*v13+v17); | ||
v32acc32 v19 = v32acc32(v18); | ||
v32acc32 v20 = sub(v19, v9); | ||
v32int32 v21 = v32int32(v20); | ||
*(v32int32 *)(v3 + 1024*v13+v17) = v21; | ||
} | ||
} | ||
return; | ||
} | ||
// clang-format on |
Oops, something went wrong.