forked from buddy-compiler/buddy-mlir
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[examples] Add several examples of
vector
dialect on GPU (buddy-com…
…piler#442) * feat: Add store, load, and bitcast * feat: Add constant_mask * feat: Add constant-mask, contract, create-mask, extract, fma, gather, splat * feat: Add insert, outerproduct and transpose * feat: Add reduction * feat: Add shape-cast and type-cast * feat: refine makefile. * feat: exclude MLIRVectorGPU from lit configuration * fix: 1. add missing newlines at the end of multiple MLIR files. 2. refine several binaries' paths in the makefile.
- Loading branch information
Showing
18 changed files
with
1,691 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// Example: `vector.bitcast` executed inside a GPU kernel.
// The kernel reinterprets one constant vector three ways and writes each
// result into a caller-provided buffer; the host prints the buffers.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // %ret0 receives the i32 -> i64 view, %ret1 the i32 -> f32 view, and
    // %ret2 the f32 -> i32 round trip.
    gpu.func @vector_bitcast(%ret0: memref<3xi64>, %ret1: memref<6xf32>, %ret2: memref<6xi32>) kernel {
      %zero = arith.constant 0 : index
      %src = arith.constant dense<[10, 20, 56, 90, 12, 90]> : vector<6xi32>

      // Six 32-bit lanes viewed as three 64-bit lanes.
      %as_i64 = vector.bitcast %src : vector<6xi32> to vector<3xi64>
      vector.store %as_i64, %ret0[%zero] : memref<3xi64>, vector<3xi64>

      // Same lane count, element type reinterpreted as f32.
      %as_f32 = vector.bitcast %src : vector<6xi32> to vector<6xf32>
      vector.store %as_f32, %ret1[%zero] : memref<6xf32>, vector<6xf32>

      // Cast the f32 view back to i32.
      %back_to_i32 = vector.bitcast %as_f32 : vector<6xf32> to vector<6xi32>
      vector.store %back_to_i32, %ret2[%zero] : memref<6xi32>, vector<6xi32>

      gpu.return
    }
  }

  // Print helpers; only declared here, defined outside this module.
  func.func private @printMemrefI64(%ptr : memref<*xi64>)
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
  func.func private @printMemrefI32(%ptr : memref<*xi32>)

  func.func @main() {
    %one = arith.constant 1 : index

    // One output buffer per bitcast result.
    %out_i64 = memref.alloc() : memref<3xi64>
    %out_f32 = memref.alloc() : memref<6xf32>
    %out_i32 = memref.alloc() : memref<6xi32>

    // Unranked views are what gpu.host_register and the print helpers take.
    %out_i64_unranked = memref.cast %out_i64 : memref<3xi64> to memref<*xi64>
    %out_f32_unranked = memref.cast %out_f32 : memref<6xf32> to memref<*xf32>
    %out_i32_unranked = memref.cast %out_i32 : memref<6xi32> to memref<*xi32>

    gpu.host_register %out_i64_unranked : memref<*xi64>
    gpu.host_register %out_f32_unranked : memref<*xf32>
    gpu.host_register %out_i32_unranked : memref<*xi32>

    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_bitcast blocks in (%one, %one, %one) threads in (%one, %one, %one) args(%out_i64 : memref<3xi64>, %out_f32 : memref<6xf32>, %out_i32 : memref<6xi32>)

    call @printMemrefI64(%out_i64_unranked) : (memref<*xi64>) -> ()
    call @printMemrefF32(%out_f32_unranked) : (memref<*xf32>) -> ()
    call @printMemrefI32(%out_i32_unranked) : (memref<*xi32>) -> ()

    func.return
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
// Example: `vector.compressstore` executed inside a GPU kernel.
// Lanes whose mask bit is set are written contiguously starting at the base
// index; lanes whose mask bit is clear are skipped.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // %base0: 1-D destination buffer (8 elements).
    // %base1: 2-D destination buffer (4x4 elements).
    gpu.func @vector_compressstore(%base0 : memref<8xi32>, %base1 : memref<4x4xi32>) kernel {

      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c3 = arith.constant 3 : index

      // case 0: 1-D memref. Two active lanes -> 100 and 102 land in
      // %base0[0] and %base0[1].
      %mask0 = arith.constant dense<[1, 0, 1]> : vector<3xi1>
      %value0 = arith.constant dense<[100, 101, 102]> : vector<3xi32>

      vector.compressstore %base0[%c0], %mask0, %value0 : memref<8xi32>, vector<3xi1>, vector<3xi32>

      // case 1: 2-D dynamically shaped memref, starting at [3, 1].
      // Only three elements remain in the buffer from that position
      // ([3, 1] .. [3, 3]), so the mask enables exactly three lanes. The
      // previous mask enabled five lanes, which wrote two elements past the
      // end of the 16-element allocation.
      // Stored values: 500 -> [3, 1], 502 -> [3, 2], 503 -> [3, 3].
      %base1_casted = memref.cast %base1 : memref<4x4xi32> to memref<?x?xi32>
      %mask1 = arith.constant dense<[1, 0, 1, 1, 0, 0, 0, 0]> : vector<8xi1>
      %value1 = arith.constant dense<[500, 501, 502, 503, 504, 505, 506, 507]> : vector<8xi32>

      vector.compressstore %base1_casted[%c3, %c1], %mask1, %value1
        : memref<?x?xi32>, vector<8xi1>, vector<8xi32>

      gpu.return
    }
  }

  // Host-side buffers that the kernel overwrites in place.
  memref.global "private" @gv0 : memref<8xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7]>

  memref.global "private" @gv1 : memref<4x4xi32> = dense<[[0, 1, 2, 3],
                                                          [4, 5, 6, 7],
                                                          [8, 9, 10, 11],
                                                          [12, 13, 14, 15]]>

  func.func @main() {
    %A = memref.get_global @gv0 : memref<8xi32>
    %B = memref.get_global @gv1 : memref<4x4xi32>
    // Unranked views are what gpu.host_register and the print helper take.
    %A_cast = memref.cast %A : memref<8xi32> to memref<*xi32>
    %B_cast = memref.cast %B : memref<4x4xi32> to memref<*xi32>
    %c1 = arith.constant 1 : index
    gpu.host_register %A_cast : memref<*xi32>
    gpu.host_register %B_cast : memref<*xi32>
    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_compressstore blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A : memref<8xi32>, %B : memref<4x4xi32>)

    call @printMemrefI32(%A_cast) : (memref<*xi32>) -> ()
    call @printMemrefI32(%B_cast) : (memref<*xi32>) -> ()

    func.return
  }
  // Print helper; only declared here, defined outside this module.
  func.func private @printMemrefI32(%ptr : memref<*xi32>)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// Example: `vector.constant_mask` executed inside a GPU kernel.
// The kernel builds a static 4x3 mask, flattens it, and stores it; the host
// loads it back, restores the 4x3 shape, and prints it.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // %result: flat 12-element buffer receiving the mask.
    gpu.func @vector_constant_mask(%result: memref<12xi1>) kernel {
      %c0 = arith.constant 0 : index

      // [3, 2] sets the leading 3 rows x 2 columns of the 4x3 mask; every
      // other element is 0.
      %mask0_vec = vector.constant_mask [3, 2] : vector<4x3xi1>

      // Flatten so the mask can be written with a single 1-D vector.store.
      %mask0_shape_casted = vector.shape_cast %mask0_vec : vector<4x3xi1> to vector<12xi1>

      vector.store %mask0_shape_casted, %result[%c0] : memref<12xi1>, vector<12xi1>
      gpu.return
    }
  }

  func.func @main() {
    %mask_created = memref.alloc() : memref<12xi1>
    // gpu.host_register takes an unranked memref.
    %mask_created_cast = memref.cast %mask_created : memref<12xi1> to memref<*xi1>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %mask_created_cast : memref<*xi1>
    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_constant_mask blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%mask_created : memref<12xi1>)

    // Read the flat mask back and print it in its original 4x3 shape.
    %mask_created_vec = vector.load %mask_created[%c0] : memref<12xi1>, vector<12xi1>
    %mask_created_vec_reshape = vector.shape_cast %mask_created_vec : vector<12xi1> to vector<4x3xi1>
    vector.print %mask_created_vec_reshape : vector<4x3xi1>

    func.return
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Indexing maps for a matrix-multiply style contraction over (i, j, k):
//   lhs[i, j] * rhs[j, k] accumulated into acc[i, k], reducing over j.
#map0 = affine_map<(i, j, k) -> (i, j)>
#map1 = affine_map<(i, j, k) -> (j, k)>
#map2 = affine_map<(i, j, k) -> (i, k)>

// Example: `vector.contract` executed inside a GPU kernel.
// The kernel multiplies a 3x4 by a 4x3 constant matrix and writes the 3x3
// product into %result; the host loads it back and prints it. The result is
// passed through a flat 9-element buffer, the same pattern the other examples
// in this directory use for multi-dimensional vectors.
module attributes {gpu.container_module} {
  gpu.module @kernel {
    // %result: flat 9-element buffer receiving the row-major 3x3 product.
    gpu.func @vector_contract(%result: memref<9xf32>) kernel {
      %c0 = arith.constant 0 : index
      %v0 = arith.constant dense<[[1., 2., 3., 4.],
                                  [5., 6., 7., 8.],
                                  [9., 10., 11., 12.]]> : vector<3x4xf32>
      %v1 = arith.constant dense<[[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.],
                                  [10., 11., 12.]]> : vector<4x3xf32>
      // Zero accumulator, so the result is the plain product %v0 x %v1.
      %v2 = arith.constant dense<[[0., 0., 0.],
                                  [0., 0., 0.],
                                  [0., 0., 0.]]> : vector<3x3xf32>
      // The iterator types follow the (i, j, k) order of the maps: j is the
      // reduction dimension.
      %v3 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]}
        %v0, %v1, %v2 : vector<3x4xf32>, vector<4x3xf32> into vector<3x3xf32>

      // Flatten so the product can be written with a single 1-D vector.store.
      %v3_flat = vector.shape_cast %v3 : vector<3x3xf32> to vector<9xf32>
      vector.store %v3_flat, %result[%c0] : memref<9xf32>, vector<9xf32>
      gpu.return
    }
  }

  func.func @main() {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index

    %result = memref.alloc() : memref<9xf32>
    // gpu.host_register takes an unranked memref.
    %result_cast = memref.cast %result : memref<9xf32> to memref<*xf32>
    gpu.host_register %result_cast : memref<*xf32>

    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernel::@vector_contract blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result : memref<9xf32>)

    // Expected product:
    //   ( ( 70, 80, 90 ), ( 158, 184, 210 ), ( 246, 288, 330 ) )
    %result_v = vector.load %result[%c0] : memref<9xf32>, vector<9xf32>
    %result_m = vector.shape_cast %result_v : vector<9xf32> to vector<3x3xf32>
    vector.print %result_m : vector<3x3xf32>

    func.return
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Example: `vector.create_mask` executed inside a GPU kernel.
// The kernel builds a 3-lane mask from a runtime index value and stores it;
// the host loads the mask back and prints it.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // %result: 3-element buffer receiving the mask.
    gpu.func @vector_create_mask(%result: memref<3xi1>) kernel {
      %origin = arith.constant 0 : index
      // Operand of create_mask: the number of leading lanes to enable.
      %active_lanes = arith.constant 2 : index
      %mask = vector.create_mask %active_lanes : vector<3xi1>
      vector.store %mask, %result[%origin] : memref<3xi1>, vector<3xi1>
      gpu.return
    }
  }

  func.func @main() {
    %zero = arith.constant 0 : index
    %one = arith.constant 1 : index

    %mask_buf = memref.alloc() : memref<3xi1>
    // gpu.host_register takes an unranked memref.
    %mask_buf_unranked = memref.cast %mask_buf : memref<3xi1> to memref<*xi1>
    gpu.host_register %mask_buf_unranked : memref<*xi1>

    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_create_mask blocks in (%one, %one, %one) threads in (%one, %one, %one) args(%mask_buf : memref<3xi1>)

    %mask_read_back = vector.load %mask_buf[%zero] : memref<3xi1>, vector<3xi1>
    vector.print %mask_read_back : vector<3xi1>

    func.return
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// Example: `vector.extract` executed inside a GPU kernel.
// Results are printed from the device with gpu.printf, so the kernel takes no
// arguments.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    gpu.func @vector_extract() kernel {

      %matrix = arith.constant dense<[[0, 1, 2],
                                      [10, 11, 12],
                                      [20, 21, 22]]> : vector<3x3xi32>

      // Full position [1, 1] yields a scalar (11).
      %center = vector.extract %matrix[1, 1] : i32 from vector<3x3xi32>
      gpu.printf "%d\n" %center : i32

      // Partial position [1] yields the whole row (10, 11, 12); its elements
      // are extracted one by one so they can be passed to gpu.printf.
      %row1 = vector.extract %matrix[1] : vector<3xi32> from vector<3x3xi32>
      %row1_e0 = vector.extract %row1[0] : i32 from vector<3xi32>
      %row1_e1 = vector.extract %row1[1] : i32 from vector<3xi32>
      %row1_e2 = vector.extract %row1[2] : i32 from vector<3xi32>
      gpu.printf "( %d, %d, %d )\n" %row1_e0, %row1_e1, %row1_e2 : i32, i32, i32
      gpu.return
    }
  }

  func.func @main() {
    %one = arith.constant 1 : index
    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_extract blocks in (%one, %one, %one) threads in (%one, %one, %one) args()
    func.return
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
// Example: `vector.fma` executed inside a GPU kernel.
// The kernel loads three rows of a global 4x4 matrix, computes
// row0 * row1 + row2 lane by lane, and stores the result; the host loads the
// result back and prints it.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Source matrix; declared inside the gpu.module alongside the kernel.
    memref.global "private" @gv : memref<4x4xf32> = dense<[[0. , 1. , 2. , 3. ],
                                                           [10., 11., 12., 13.],
                                                           [20., 21., 22., 23.],
                                                           [30., 31., 32., 33.]]>

    // %result: 4-element buffer receiving the fused multiply-add result.
    gpu.func @vector_fma(%result: memref<4xf32>) kernel {
      %row_idx0 = arith.constant 0 : index
      %row_idx1 = arith.constant 1 : index
      %row_idx2 = arith.constant 2 : index

      %matrix = memref.get_global @gv : memref<4x4xf32>

      // Each load reads one full row starting at column 0.
      %lhs = vector.load %matrix[%row_idx0, %row_idx0] : memref<4x4xf32>, vector<4xf32>
      %rhs = vector.load %matrix[%row_idx1, %row_idx0] : memref<4x4xf32>, vector<4xf32>
      %acc = vector.load %matrix[%row_idx2, %row_idx0] : memref<4x4xf32>, vector<4xf32>

      // %lhs * %rhs + %acc; with the data above: ( 20, 32, 46, 62 ).
      %fused = vector.fma %lhs, %rhs, %acc : vector<4xf32>
      vector.store %fused, %result[%row_idx0] : memref<4xf32>, vector<4xf32>
      gpu.return
    }
  }

  func.func @main() {
    %zero = arith.constant 0 : index
    %one = arith.constant 1 : index

    %out = memref.alloc() : memref<4xf32>
    // gpu.host_register takes an unranked memref.
    %out_unranked = memref.cast %out : memref<4xf32> to memref<*xf32>
    gpu.host_register %out_unranked : memref<*xf32>

    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_fma blocks in (%one, %one, %one) threads in (%one, %one, %one) args(%out : memref<4xf32>)

    %out_v = vector.load %out[%zero] : memref<4xf32>, vector<4xf32>
    vector.print %out_v : vector<4xf32>
    func.return
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
// Example: `vector.gather` executed inside a GPU kernel.
// Four gathers are performed (unmasked, masked, 2-D, and 2-D with
// out-of-bounds indices). Each result is written to its own 4-element buffer,
// which the host loads back and prints.
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Source data; declared inside the gpu.module alongside the kernel.
    memref.global "private" @gv0 : memref<8xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7]>

    memref.global "private" @gv1 : memref<4x4xi32> = dense<[[0, 1, 2, 3],
                                                            [4, 5, 6, 7],
                                                            [8, 9, 10, 11],
                                                            [12, 13, 14, 15]]>

    // %result0..%result3: one flat 4-element output buffer per gather case.
    gpu.func @vector_gather(%result0: memref<4xi32>, %result1: memref<4xi32>, %result2: memref<4xi32>, %result3: memref<4xi32>) kernel {

      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index

      %base0 = memref.get_global @gv0 : memref<8xi32>
      %base1 = memref.get_global @gv1 : memref<4x4xi32>

      // Pass-through values are what masked-off lanes receive.
      %pass_thru_4 = arith.constant dense<[2330, 2331, 2332, 2333]> : vector<4xi32>
      %pass_thru_2x2 = arith.constant dense<114> : vector<2x2xi32>

      // normal: every lane enabled; gathers %base0 at offsets 3, 4, 2, 1.
      %mask0 = arith.constant dense<1> : vector<4xi1>
      %index0 = arith.constant dense<[3, 4, 2, 1]> : vector<4xi32>
      %v0 = vector.gather %base0[%c0][%index0], %mask0, %pass_thru_4
        : memref<8xi32>, vector<4xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
      vector.store %v0, %result0[%c0] : memref<4xi32>, vector<4xi32>

      // with mask: lanes 1 and 3 are disabled and take their values from
      // %pass_thru_4 instead of memory.
      %mask1 = arith.constant dense<[1, 0, 1, 0]> : vector<4xi1>
      %index1 = arith.constant dense<[3, 4, 2, 1]> : vector<4xi32>
      %v1 = vector.gather %base0[%c0][%index1], %mask1, %pass_thru_4
        : memref<8xi32>, vector<4xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
      vector.store %v1, %result1[%c0] : memref<4xi32>, vector<4xi32>

      // 2-D base and 2-D index vector, gathering relative to [1, 1].
      %mask2 = arith.constant dense<1> : vector<2x2xi1>
      %index2 = arith.constant dense<[[1, 0], [3, 2]]> : vector<2x2xi32>
      %v2 = vector.gather %base1[%c1, %c1][%index2], %mask2, %pass_thru_2x2
        : memref<4x4xi32>, vector<2x2xi32>, vector<2x2xi1>, vector<2x2xi32> into vector<2x2xi32>
      // Flatten so the result can be written with a single 1-D vector.store.
      %v2_shape_casted = vector.shape_cast %v2 : vector<2x2xi32> to vector<4xi32>
      vector.store %v2_shape_casted, %result2[%c0] : memref<4xi32>, vector<4xi32>

      // 2-D gather with indices that deliberately fall outside the buffer.
      %mask3 = arith.constant dense<1> : vector<2x2xi1>
      %index3 = arith.constant dense<[[-1, -8], [5, 13]]> : vector<2x2xi32>
      %v3 = vector.gather %base1[%c1, %c1][%index3], %mask3, %pass_thru_2x2
        : memref<4x4xi32>, vector<2x2xi32>, vector<2x2xi1>, vector<2x2xi32> into vector<2x2xi32>

      // Observed result: ( ( 4, 0 ), ( 10, 0 ) ).
      // On GPU, elements whose indices are out of bounds come back as 0, which
      // differs from the CPU case.
      %v3_shape_casted = vector.shape_cast %v3 : vector<2x2xi32> to vector<4xi32>
      vector.store %v3_shape_casted, %result3[%c0] : memref<4xi32>, vector<4xi32>

      gpu.return
    }
  }

  func.func @main() {
    %result0 = memref.alloc() : memref<4xi32>
    %result1 = memref.alloc() : memref<4xi32>
    %result2 = memref.alloc() : memref<4xi32>
    %result3 = memref.alloc() : memref<4xi32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index

    // register host memory (gpu.host_register takes unranked memrefs)
    %result0_cast = memref.cast %result0 : memref<4xi32> to memref<*xi32>
    %result1_cast = memref.cast %result1 : memref<4xi32> to memref<*xi32>
    %result2_cast = memref.cast %result2 : memref<4xi32> to memref<*xi32>
    %result3_cast = memref.cast %result3 : memref<4xi32> to memref<*xi32>

    gpu.host_register %result0_cast : memref<*xi32>
    gpu.host_register %result1_cast : memref<*xi32>
    gpu.host_register %result2_cast : memref<*xi32>
    gpu.host_register %result3_cast : memref<*xi32>

    // Launch with a 1x1x1 grid of 1x1x1 blocks.
    gpu.launch_func @kernels::@vector_gather blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result0 : memref<4xi32>, %result1 : memref<4xi32>, %result2 : memref<4xi32>, %result3 : memref<4xi32>)

    // 1-D results are printed as stored.
    %result0_v = vector.load %result0[%c0] : memref<4xi32>, vector<4xi32>
    vector.print %result0_v : vector<4xi32>

    %result1_v = vector.load %result1[%c0] : memref<4xi32>, vector<4xi32>
    vector.print %result1_v : vector<4xi32>

    // 2-D results were flattened in the kernel; restore the 2x2 shape first.
    %result2_v = vector.load %result2[%c0] : memref<4xi32>, vector<4xi32>
    %result2_v_reshape = vector.shape_cast %result2_v : vector<4xi32> to vector<2x2xi32>
    vector.print %result2_v_reshape : vector<2x2xi32>

    %result3_v = vector.load %result3[%c0] : memref<4xi32>, vector<4xi32>
    %result3_v_reshape = vector.shape_cast %result3_v : vector<4xi32> to vector<2x2xi32>
    vector.print %result3_v_reshape : vector<2x2xi32>

    func.return
  }
}
Oops, something went wrong.