Skip to content

Commit

Permalink
[examples] Add several examples of vector dialect on GPU (buddy-com…
Browse files Browse the repository at this point in the history
…piler#442)

* feat: Add store, load, and bitcast

* feat: Add constant_mask

* feat: Add constant-mask, contract, create-mask, extract, fma, gather, splat

* feat: Add insert, outerproduct and transpose

* feat: Add reduction

* feat: Add shape-cast and type-cast

* feat: refine makefile.

* feat: exclude MLIRVectorGPU from lit configuration

* fix: 1. add missing newlines at the end of multiple MLIR files. 2. refine several binaries' paths in the makefile.
  • Loading branch information
xTayEx authored and asdf1113 committed Mar 3, 2025
1 parent ce3b1b8 commit f70aeab
Show file tree
Hide file tree
Showing 18 changed files with 1,691 additions and 0 deletions.
1,024 changes: 1,024 additions & 0 deletions examples/MLIRVectorGPU/makefile

Large diffs are not rendered by default.

45 changes: 45 additions & 0 deletions examples/MLIRVectorGPU/vector-bitcast.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.bitcast, which reinterprets the bits of a
    // vector as another element type without changing the underlying data.
    gpu.func @vector_bitcast(%ret0: memref<3xi64>, %ret1: memref<6xf32>, %ret2: memref<6xi32>) kernel {
      %c0 = arith.constant 0 : index
      %v0 = arith.constant dense<[10, 20, 56, 90, 12, 90]> : vector<6xi32>

      // 6 x i32 -> 3 x i64: adjacent pairs of i32 lanes fuse into one i64 lane.
      %v1 = vector.bitcast %v0 : vector<6xi32> to vector<3xi64>
      vector.store %v1, %ret0[%c0] : memref<3xi64>, vector<3xi64>

      // 6 x i32 -> 6 x f32: same lane count, integer bits reinterpreted as float.
      %v2 = vector.bitcast %v0 : vector<6xi32> to vector<6xf32>
      vector.store %v2, %ret1[%c0] : memref<6xf32>, vector<6xf32>

      // Round-trip back to i32: recovers the original integer values.
      %v3 = vector.bitcast %v2 : vector<6xf32> to vector<6xi32>
      vector.store %v3, %ret2[%c0] : memref<6xi32>, vector<6xi32>

      gpu.return
    }
  }

  // Host side: allocate result buffers, register them with the GPU runtime,
  // launch the kernel on a single thread, and print the results.
  func.func @main() {
    %c1 = arith.constant 1 : index
    %kernel_ret0 = memref.alloc() : memref<3xi64>
    %kernel_ret0_cast = memref.cast %kernel_ret0 : memref<3xi64> to memref<*xi64>

    %kernel_ret1 = memref.alloc() : memref<6xf32>
    %kernel_ret1_cast = memref.cast %kernel_ret1 : memref<6xf32> to memref<*xf32>

    %kernel_ret2 = memref.alloc() : memref<6xi32>
    %kernel_ret2_cast = memref.cast %kernel_ret2 : memref<6xi32> to memref<*xi32>

    gpu.host_register %kernel_ret0_cast : memref<*xi64>
    gpu.host_register %kernel_ret1_cast : memref<*xf32>
    gpu.host_register %kernel_ret2_cast : memref<*xi32>
    gpu.launch_func @kernels::@vector_bitcast blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%kernel_ret0 : memref<3xi64>, %kernel_ret1 : memref<6xf32>, %kernel_ret2 : memref<6xi32>)

    call @printMemrefI64(%kernel_ret0_cast) : (memref<*xi64>) -> ()
    call @printMemrefF32(%kernel_ret1_cast) : (memref<*xf32>) -> ()
    call @printMemrefI32(%kernel_ret2_cast) : (memref<*xi32>) -> ()

    // Release the buffers allocated above; they were previously leaked.
    // NOTE(review): if the runtime requires pinned memory to be unregistered
    // before freeing, add gpu.host_unregister first — confirm against the
    // lowering pipeline used by these examples.
    memref.dealloc %kernel_ret0 : memref<3xi64>
    memref.dealloc %kernel_ret1 : memref<6xf32>
    memref.dealloc %kernel_ret2 : memref<6xi32>

    func.return
  }
  // External print helpers (no body here; resolved when the example is run).
  func.func private @printMemrefI64(%ptr : memref<*xi64>)
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
  func.func private @printMemrefI32(%ptr : memref<*xi32>)
}
50 changes: 50 additions & 0 deletions examples/MLIRVectorGPU/vector-compressstore.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.compressstore, which writes only the vector
    // elements whose mask bit is set, packing them contiguously in memory
    // starting at the given base index.
    gpu.func @vector_compressstore(%base0 : memref<8xi32>, %base1 : memref<4x4xi32>) kernel {

      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c3 = arith.constant 3 : index

      // case 0: 1-D base. The mask selects lanes 0 and 2, so values 100 and
      // 102 are stored back-to-back starting at %base0[0].
      %mask0 = arith.constant dense<[1, 0, 1]> : vector<3xi1>
      %value0 = arith.constant dense<[100, 101, 102]> : vector<3xi32>

      vector.compressstore %base0[%c0], %mask0, %value0 : memref<8xi32>, vector<3xi1>, vector<3xi32>

      // case 1: dynamically-shaped 2-D base. The store starts at [3, 1] and
      // writes the five mask-selected values contiguously from there.
      %base1_casted = memref.cast %base1 : memref<4x4xi32> to memref<?x?xi32>
      %mask1 = arith.constant dense<[1, 0, 1, 1, 1, 1, 0, 0]> : vector<8xi1>
      %value1 = arith.constant dense<[500, 501, 502, 503, 504, 505, 506, 507]> : vector<8xi32>

      vector.compressstore %base1_casted[%c3, %c1], %mask1, %value1
          : memref<?x?xi32>, vector<8xi1>, vector<8xi32>

      gpu.return
    }
  }

  // Input buffers for the two cases above; printed after the launch so the
  // compressed writes are visible.
  memref.global "private" @gv0 : memref<8xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7]>

  memref.global "private" @gv1 : memref<4x4xi32> = dense<[[0, 1, 2, 3],
                                                          [4, 5, 6, 7],
                                                          [8, 9, 10, 11],
                                                          [12, 13, 14, 15]]>

  // Host side: register the globals with the GPU runtime, launch the kernel
  // on a single thread, then print both buffers.
  func.func @main() {
    %A = memref.get_global @gv0 : memref<8xi32>
    %B = memref.get_global @gv1 : memref<4x4xi32>
    %A_cast = memref.cast %A : memref<8xi32> to memref<*xi32>
    %B_cast = memref.cast %B : memref<4x4xi32> to memref<*xi32>
    %c1 = arith.constant 1 : index
    gpu.host_register %A_cast : memref<*xi32>
    gpu.host_register %B_cast : memref<*xi32>
    gpu.launch_func @kernels::@vector_compressstore blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A : memref<8xi32>, %B : memref<4x4xi32>)

    call @printMemrefI32(%A_cast) : (memref<*xi32>) -> ()
    call @printMemrefI32(%B_cast) : (memref<*xi32>) -> ()

    func.return
  }
  // External print helper (no body here; resolved when the example is run).
  func.func private @printMemrefI32(%ptr : memref<*xi32>)
}
31 changes: 31 additions & 0 deletions examples/MLIRVectorGPU/vector-constant-mask.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.constant_mask, which materializes a
    // statically-known mask: here the top-left 3x2 region of a 4x3 mask is
    // all ones and the remainder is zero.
    gpu.func @vector_constant_mask(%result: memref<12xi1>) kernel {
      %c0 = arith.constant 0 : index
      %mask0_vec = vector.constant_mask [3, 2] : vector<4x3xi1>

      // Flatten so the mask can be stored through the 1-D result memref.
      %mask0_shape_casted = vector.shape_cast %mask0_vec : vector<4x3xi1> to vector<12xi1>

      vector.store %mask0_shape_casted, %result[%c0] : memref<12xi1>, vector<12xi1>
      gpu.return
    }
  }

  // Host side: launch the kernel, read back the flattened mask, restore its
  // 4x3 shape, and print it.
  func.func @main() {
    %mask_created = memref.alloc() : memref<12xi1>
    %mask_created_cast = memref.cast %mask_created : memref<12xi1> to memref<*xi1>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %mask_created_cast : memref<*xi1>
    gpu.launch_func @kernels::@vector_constant_mask blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%mask_created : memref<12xi1>)
    %mask_created_vec = vector.load %mask_created[%c0] : memref<12xi1>, vector<12xi1>
    %mask_created_vec_reshape = vector.shape_cast %mask_created_vec : vector<12xi1> to vector<4x3xi1>
    vector.print %mask_created_vec_reshape : vector<4x3xi1>

    // Free the host buffer; it was previously leaked.
    memref.dealloc %mask_created : memref<12xi1>

    func.return
  }
  // Dropped: an unused private declaration @printMemrefI32 whose type
  // (memref<*xi1>) contradicted its name and was never called.
}
34 changes: 34 additions & 0 deletions examples/MLIRVectorGPU/vector-contract.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Indexing maps for a matrix-multiply contraction over dimensions (i, j, k):
// A is indexed by (i, j), B by (j, k), and the accumulator C by (i, k);
// j is the reduction dimension.
#map0 = affine_map<(i, j, k) -> (i, j)>
#map1 = affine_map<(i, j, k) -> (j, k)>
#map2 = affine_map<(i, j, k) -> (i, k)>

module attributes {gpu.container_module} {
  gpu.module @kernel {
    // Kernel: demonstrate vector.contract computing a 3x4 * 4x3 matrix
    // product into a zero-initialized 3x3 accumulator.
    // NOTE(review): %v3 is never stored or printed (the store below is
    // commented out), so this example only exercises lowering of the op.
    gpu.func @vector_contract() kernel {
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c3 = arith.constant 3 : index
      %v0 = arith.constant dense<[[1., 2., 3., 4.],
                                  [5., 6., 7., 8.],
                                  [9., 10., 11., 12.]]> : vector<3x4xf32>
      %v1 = arith.constant dense<[[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.],
                                  [10., 11., 12.]]> : vector<4x3xf32>
      %v2 = arith.constant dense<[[0., 0., 0.],
                                  [0., 0., 0.],
                                  [0., 0., 0.]]> : vector<3x3xf32>
      // "parallel" iterators i and k index the output; the "reduction"
      // iterator j is summed over.
      %v3 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]}
          %v0, %v1, %v2 : vector<3x4xf32>, vector<4x3xf32> into vector<3x3xf32>
      // vector.store %v3, %result[] : memref<vector<3x3xf32>>, vector<3x3xf32>
      gpu.return
    }
  }

  // Host side: launch the kernel on a single thread. There is no observable
  // output (see the kernel-level note above this func in the original file).
  func.func @main() {
    %c1 = arith.constant 1 : index
    gpu.launch_func @kernel::@vector_contract blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args()
    func.return
  }
  // NOTE(review): declared but never called; the unranked-memref-of-vector
  // element type also looks unusual for a print helper — verify before use.
  func.func private @printMemrefF32(%ptr : memref<*xvector<3x3xf32>>)
}
23 changes: 23 additions & 0 deletions examples/MLIRVectorGPU/vector-create-mask.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.create_mask, which builds a mask whose first
    // %cons2 (= 2) lanes are 1 and the remaining lane is 0.
    gpu.func @vector_create_mask(%result: memref<3xi1>) kernel {
      %c0 = arith.constant 0 : index
      %cons2 = arith.constant 2 : index
      %mask0 = vector.create_mask %cons2 : vector<3xi1>
      vector.store %mask0, %result[%c0] : memref<3xi1>, vector<3xi1>
      gpu.return
    }
  }
  // Host side: launch the kernel, read the mask back, and print it.
  func.func @main() {
    %result = memref.alloc() : memref<3xi1>
    %result_cast = memref.cast %result : memref<3xi1> to memref<*xi1>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %result_cast : memref<*xi1>
    gpu.launch_func @kernels::@vector_create_mask blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result : memref<3xi1>)
    %result_v = vector.load %result[%c0] : memref<3xi1>, vector<3xi1>
    vector.print %result_v : vector<3xi1>

    // Free the host buffer; it was previously leaked.
    memref.dealloc %result : memref<3xi1>

    func.return
  }
}
26 changes: 26 additions & 0 deletions examples/MLIRVectorGPU/vector-extract.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.extract on scalars and sub-vectors, printing
    // the results directly from device code with gpu.printf.
    gpu.func @vector_extract() kernel {

      // 3x3 source matrix.
      %base = arith.constant dense<[[0, 1, 2],
                                    [10, 11, 12],
                                    [20, 21, 22]]> : vector<3x3xi32>

      // Extract a single element at row 1, column 1 (value 11).
      %c0 = vector.extract %base[1, 1] : i32 from vector<3x3xi32>
      gpu.printf "%d\n" %c0 : i32

      // Extract row 1 as a 1-D vector, then pull out its three elements.
      %w1 = vector.extract %base[1] : vector<3xi32> from vector<3x3xi32>
      %w1_0 = vector.extract %w1[0] : i32 from vector<3xi32>
      %w1_1 = vector.extract %w1[1] : i32 from vector<3xi32>
      %w1_2 = vector.extract %w1[2] : i32 from vector<3xi32>
      gpu.printf "( %d, %d, %d )\n" %w1_0, %w1_1, %w1_2 : i32, i32, i32
      gpu.return
    }
  }

  // Host side: launch the kernel on a single thread; all output comes from
  // device-side printf.
  func.func @main() {
    %c1 = arith.constant 1 : index
    gpu.launch_func @kernels::@vector_extract blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args()
    func.return
  }
}
32 changes: 32 additions & 0 deletions examples/MLIRVectorGPU/vector-fma.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Source matrix; rows 0-2 supply the three operands of the FMA.
    memref.global "private" @gv : memref<4x4xf32> = dense<[[0. , 1. , 2. , 3. ],
                                                           [10., 11., 12., 13.],
                                                           [20., 21., 22., 23.],
                                                           [30., 31., 32., 33.]]>
    // Kernel: demonstrate vector.fma, computing row0 * row1 + row2
    // elementwise and storing the 4-lane result.
    gpu.func @vector_fma(%result: memref<4xf32>) kernel {
      %mem = memref.get_global @gv : memref<4x4xf32>
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c2 = arith.constant 2 : index
      %load_vec1 = vector.load %mem[%c0, %c0] : memref<4x4xf32>, vector<4xf32>
      %load_vec2 = vector.load %mem[%c1, %c0] : memref<4x4xf32>, vector<4xf32>
      %load_vec3 = vector.load %mem[%c2, %c0] : memref<4x4xf32>, vector<4xf32>
      %res = vector.fma %load_vec1, %load_vec2, %load_vec3 : vector<4xf32>
      vector.store %res, %result[%c0] : memref<4xf32>, vector<4xf32>
      gpu.return
    }
  }

  // Host side: launch the kernel, read the result back, and print it.
  func.func @main() {
    %result = memref.alloc() : memref<4xf32>
    %result_cast = memref.cast %result : memref<4xf32> to memref<*xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %result_cast : memref<*xf32>
    gpu.launch_func @kernels::@vector_fma blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result : memref<4xf32>)
    %result_v = vector.load %result[%c0] : memref<4xf32>, vector<4xf32>
    vector.print %result_v : vector<4xf32>

    // Free the host buffer; it was previously leaked.
    memref.dealloc %result : memref<4xf32>

    func.return
  }
}
99 changes: 99 additions & 0 deletions examples/MLIRVectorGPU/vector-gather.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Source data for the gather cases below.
    memref.global "private" @gv0 : memref<8xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7]>

    memref.global "private" @gv1 : memref<4x4xi32> = dense<[[0, 1, 2, 3],
                                                            [4, 5, 6, 7],
                                                            [8, 9, 10, 11],
                                                            [12, 13, 14, 15]]>

    // Kernel: demonstrate vector.gather, which loads one element per lane
    // from base memory at the per-lane index, substituting the pass-through
    // value for lanes whose mask bit is off.
    gpu.func @vector_gather(%result0: memref<4xi32>, %result1: memref<4xi32>, %result2: memref<4xi32>, %result3: memref<4xi32>) kernel {

      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c2 = arith.constant 2 : index
      %c3 = arith.constant 3 : index

      %base0 = memref.get_global @gv0 : memref<8xi32>
      %base1 = memref.get_global @gv1 : memref<4x4xi32>

      %pass_thru_4 = arith.constant dense<[2330, 2331, 2332, 2333]> : vector<4xi32>
      %pass_thru_2x2 = arith.constant dense<114> : vector<2x2xi32>

      // normal: all-true mask gathers gv0[3], gv0[4], gv0[2], gv0[1].
      %mask0 = arith.constant dense<1> : vector<4xi1>
      %index0 = arith.constant dense<[3, 4, 2, 1]> : vector<4xi32>
      %v0 = vector.gather %base0[%c0][%index0], %mask0, %pass_thru_4
          : memref<8xi32>, vector<4xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
      vector.store %v0, %result0[%c0] : memref<4xi32>, vector<4xi32>

      // with mask: lanes 1 and 3 are masked off and take pass-through values.
      %mask1 = arith.constant dense<[1, 0, 1, 0]> : vector<4xi1>
      %index1 = arith.constant dense<[3, 4, 2, 1]> : vector<4xi32>
      %v1 = vector.gather %base0[%c0][%index1], %mask1, %pass_thru_4
          : memref<8xi32>, vector<4xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
      vector.store %v1, %result1[%c0] : memref<4xi32>, vector<4xi32>

      // 2-D gather: per-lane offsets are applied relative to %base1[1, 1].
      %mask2 = arith.constant dense<1> : vector<2x2xi1>
      %index2 = arith.constant dense<[[1, 0], [3, 2]]> : vector<2x2xi32>
      %v2 = vector.gather %base1[%c1, %c1][%index2], %mask2, %pass_thru_2x2
          : memref<4x4xi32>, vector<2x2xi32>, vector<2x2xi1>, vector<2x2xi32> into vector<2x2xi32>
      %v2_shape_casted = vector.shape_cast %v2 : vector<2x2xi32> to vector<4xi32>
      vector.store %v2_shape_casted, %result2[%c0] : memref<4xi32>, vector<4xi32>

      // Out-of-bound indices.
      %mask3 = arith.constant dense<1> : vector<2x2xi1>
      %index3 = arith.constant dense<[[-1, -8], [5, 13]]> : vector<2x2xi32>
      %v3 = vector.gather %base1[%c1, %c1][%index3], %mask3, %pass_thru_2x2
          : memref<4x4xi32>, vector<2x2xi32>, vector<2x2xi1>, vector<2x2xi32> into vector<2x2xi32>

      // ( ( 4, 0), ( 10, 0 ) ).
      // On GPU, if indices are out-of-bound, the elements will be 0, which is different
      // from the CPU case.
      %v3_shape_casted = vector.shape_cast %v3 : vector<2x2xi32> to vector<4xi32>
      vector.store %v3_shape_casted, %result3[%c0] : memref<4xi32>, vector<4xi32>

      gpu.return
    }
  }

  // Host side: allocate and register the four result buffers, launch the
  // kernel on a single thread, then print each result.
  func.func @main() {
    %result0 = memref.alloc() : memref<4xi32>
    %result1 = memref.alloc() : memref<4xi32>
    %result2 = memref.alloc() : memref<4xi32>
    %result3 = memref.alloc() : memref<4xi32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index

    // register host memory
    %result0_cast = memref.cast %result0 : memref<4xi32> to memref<*xi32>
    %result1_cast = memref.cast %result1 : memref<4xi32> to memref<*xi32>
    %result2_cast = memref.cast %result2 : memref<4xi32> to memref<*xi32>
    %result3_cast = memref.cast %result3 : memref<4xi32> to memref<*xi32>

    gpu.host_register %result0_cast : memref<*xi32>
    gpu.host_register %result1_cast : memref<*xi32>
    gpu.host_register %result2_cast : memref<*xi32>
    gpu.host_register %result3_cast : memref<*xi32>

    gpu.launch_func @kernels::@vector_gather blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result0 : memref<4xi32>, %result1 : memref<4xi32>, %result2 : memref<4xi32>, %result3 : memref<4xi32>)

    %result0_v = vector.load %result0[%c0] : memref<4xi32>, vector<4xi32>
    vector.print %result0_v : vector<4xi32>

    %result1_v = vector.load %result1[%c0] : memref<4xi32>, vector<4xi32>
    vector.print %result1_v : vector<4xi32>

    %result2_v = vector.load %result2[%c0] : memref<4xi32>, vector<4xi32>
    %result2_v_reshape = vector.shape_cast %result2_v : vector<4xi32> to vector<2x2xi32>
    vector.print %result2_v_reshape : vector<2x2xi32>

    %result3_v = vector.load %result3[%c0] : memref<4xi32>, vector<4xi32>
    %result3_v_reshape = vector.shape_cast %result3_v : vector<4xi32> to vector<2x2xi32>
    vector.print %result3_v_reshape : vector<2x2xi32>

    // Free the host buffers; they were previously leaked. (The unused
    // %base2/%pass_thru_8 locals and the unreferenced private global @gv2
    // were also dropped.)
    memref.dealloc %result0 : memref<4xi32>
    memref.dealloc %result1 : memref<4xi32>
    memref.dealloc %result2 : memref<4xi32>
    memref.dealloc %result3 : memref<4xi32>

    func.return
  }
}
Loading

0 comments on commit f70aeab

Please sign in to comment.