Skip to content

Commit

Permalink
[examples] Add several examples of vector dialect on GPU (buddy-com…
Browse files Browse the repository at this point in the history
…piler#442)

* feat: Add store, load, and bitcast

* feat: Add constant_mask

* feat: Add constant-mask, contract, create-mask, extract, fma, gather, splat

* feat: Add insert, outerproduct and transpose

* feat: Add reduction

* feat: Add shape-cast and type-cast

* feat: refine makefile.

* feat: exclude MLIRVectorGPU from lit configuration

* fix: 1. add missing newlines at the end of multiple MLIR files. 2. refine several binaries' paths in the makefile.
  • Loading branch information
xTayEx authored and asdf1113 committed Mar 3, 2025
1 parent ce3b1b8 commit f70aeab
Show file tree
Hide file tree
Showing 18 changed files with 1,691 additions and 0 deletions.
1,024 changes: 1,024 additions & 0 deletions examples/MLIRVectorGPU/makefile

Large diffs are not rendered by default.

45 changes: 45 additions & 0 deletions examples/MLIRVectorGPU/vector-bitcast.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.bitcast, which reinterprets the bits of a
    // vector as another element type without changing the underlying data.
    gpu.func @vector_bitcast(%ret0: memref<3xi64>, %ret1: memref<6xf32>, %ret2: memref<6xi32>) kernel {
      %c0 = arith.constant 0 : index
      %v0 = arith.constant dense<[10, 20, 56, 90, 12, 90]> : vector<6xi32>

      // 6 x i32 -> 3 x i64: adjacent pairs of i32 lanes fuse into one i64 lane.
      %v1 = vector.bitcast %v0 : vector<6xi32> to vector<3xi64>
      vector.store %v1, %ret0[%c0] : memref<3xi64>, vector<3xi64>

      // 6 x i32 -> 6 x f32: same lane count, integer bits reinterpreted as float.
      %v2 = vector.bitcast %v0 : vector<6xi32> to vector<6xf32>
      vector.store %v2, %ret1[%c0] : memref<6xf32>, vector<6xf32>

      // Round-trip back to i32: recovers the original integer values.
      %v3 = vector.bitcast %v2 : vector<6xf32> to vector<6xi32>
      vector.store %v3, %ret2[%c0] : memref<6xi32>, vector<6xi32>

      gpu.return
    }
  }

  // Host side: allocate result buffers, register them with the GPU runtime,
  // launch the kernel on a single thread, and print the results.
  func.func @main() {
    %c1 = arith.constant 1 : index
    %kernel_ret0 = memref.alloc() : memref<3xi64>
    %kernel_ret0_cast = memref.cast %kernel_ret0 : memref<3xi64> to memref<*xi64>

    %kernel_ret1 = memref.alloc() : memref<6xf32>
    %kernel_ret1_cast = memref.cast %kernel_ret1 : memref<6xf32> to memref<*xf32>

    %kernel_ret2 = memref.alloc() : memref<6xi32>
    %kernel_ret2_cast = memref.cast %kernel_ret2 : memref<6xi32> to memref<*xi32>

    gpu.host_register %kernel_ret0_cast : memref<*xi64>
    gpu.host_register %kernel_ret1_cast : memref<*xf32>
    gpu.host_register %kernel_ret2_cast : memref<*xi32>
    gpu.launch_func @kernels::@vector_bitcast blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%kernel_ret0 : memref<3xi64>, %kernel_ret1 : memref<6xf32>, %kernel_ret2 : memref<6xi32>)

    call @printMemrefI64(%kernel_ret0_cast) : (memref<*xi64>) -> ()
    call @printMemrefF32(%kernel_ret1_cast) : (memref<*xf32>) -> ()
    call @printMemrefI32(%kernel_ret2_cast) : (memref<*xi32>) -> ()

    // Release the buffers allocated above; they were previously leaked.
    // NOTE(review): if the runtime requires pinned memory to be unregistered
    // before freeing, add gpu.host_unregister first — confirm against the
    // lowering pipeline used by these examples.
    memref.dealloc %kernel_ret0 : memref<3xi64>
    memref.dealloc %kernel_ret1 : memref<6xf32>
    memref.dealloc %kernel_ret2 : memref<6xi32>

    func.return
  }
  // External print helpers (no body here; resolved when the example is run).
  func.func private @printMemrefI64(%ptr : memref<*xi64>)
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
  func.func private @printMemrefI32(%ptr : memref<*xi32>)
}
50 changes: 50 additions & 0 deletions examples/MLIRVectorGPU/vector-compressstore.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.compressstore, which writes only the vector
    // elements whose mask bit is set, packing them contiguously in memory
    // starting at the given base index.
    gpu.func @vector_compressstore(%base0 : memref<8xi32>, %base1 : memref<4x4xi32>) kernel {

      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c3 = arith.constant 3 : index

      // case 0: 1-D base. The mask selects lanes 0 and 2, so values 100 and
      // 102 are stored back-to-back starting at %base0[0].
      %mask0 = arith.constant dense<[1, 0, 1]> : vector<3xi1>
      %value0 = arith.constant dense<[100, 101, 102]> : vector<3xi32>

      vector.compressstore %base0[%c0], %mask0, %value0 : memref<8xi32>, vector<3xi1>, vector<3xi32>

      // case 1: dynamically-shaped 2-D base. The store starts at [3, 1] and
      // writes the five mask-selected values contiguously from there.
      %base1_casted = memref.cast %base1 : memref<4x4xi32> to memref<?x?xi32>
      %mask1 = arith.constant dense<[1, 0, 1, 1, 1, 1, 0, 0]> : vector<8xi1>
      %value1 = arith.constant dense<[500, 501, 502, 503, 504, 505, 506, 507]> : vector<8xi32>

      vector.compressstore %base1_casted[%c3, %c1], %mask1, %value1
          : memref<?x?xi32>, vector<8xi1>, vector<8xi32>

      gpu.return
    }
  }

  // Input buffers for the two cases above; printed after the launch so the
  // compressed writes are visible.
  memref.global "private" @gv0 : memref<8xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7]>

  memref.global "private" @gv1 : memref<4x4xi32> = dense<[[0, 1, 2, 3],
                                                          [4, 5, 6, 7],
                                                          [8, 9, 10, 11],
                                                          [12, 13, 14, 15]]>

  // Host side: register the globals with the GPU runtime, launch the kernel
  // on a single thread, then print both buffers.
  func.func @main() {
    %A = memref.get_global @gv0 : memref<8xi32>
    %B = memref.get_global @gv1 : memref<4x4xi32>
    %A_cast = memref.cast %A : memref<8xi32> to memref<*xi32>
    %B_cast = memref.cast %B : memref<4x4xi32> to memref<*xi32>
    %c1 = arith.constant 1 : index
    gpu.host_register %A_cast : memref<*xi32>
    gpu.host_register %B_cast : memref<*xi32>
    gpu.launch_func @kernels::@vector_compressstore blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A : memref<8xi32>, %B : memref<4x4xi32>)

    call @printMemrefI32(%A_cast) : (memref<*xi32>) -> ()
    call @printMemrefI32(%B_cast) : (memref<*xi32>) -> ()

    func.return
  }
  // External print helper (no body here; resolved when the example is run).
  func.func private @printMemrefI32(%ptr : memref<*xi32>)
}
31 changes: 31 additions & 0 deletions examples/MLIRVectorGPU/vector-constant-mask.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.constant_mask, which materializes a
    // statically-known mask: here the top-left 3x2 region of a 4x3 mask is
    // all ones and the remainder is zero.
    gpu.func @vector_constant_mask(%result: memref<12xi1>) kernel {
      %c0 = arith.constant 0 : index
      %mask0_vec = vector.constant_mask [3, 2] : vector<4x3xi1>

      // Flatten so the mask can be stored through the 1-D result memref.
      %mask0_shape_casted = vector.shape_cast %mask0_vec : vector<4x3xi1> to vector<12xi1>

      vector.store %mask0_shape_casted, %result[%c0] : memref<12xi1>, vector<12xi1>
      gpu.return
    }
  }

  // Host side: launch the kernel, read back the flattened mask, restore its
  // 4x3 shape, and print it.
  func.func @main() {
    %mask_created = memref.alloc() : memref<12xi1>
    %mask_created_cast = memref.cast %mask_created : memref<12xi1> to memref<*xi1>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %mask_created_cast : memref<*xi1>
    gpu.launch_func @kernels::@vector_constant_mask blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%mask_created : memref<12xi1>)
    %mask_created_vec = vector.load %mask_created[%c0] : memref<12xi1>, vector<12xi1>
    %mask_created_vec_reshape = vector.shape_cast %mask_created_vec : vector<12xi1> to vector<4x3xi1>
    vector.print %mask_created_vec_reshape : vector<4x3xi1>

    // Free the host buffer; it was previously leaked.
    memref.dealloc %mask_created : memref<12xi1>

    func.return
  }
  // Dropped: an unused private declaration @printMemrefI32 whose type
  // (memref<*xi1>) contradicted its name and was never called.
}
34 changes: 34 additions & 0 deletions examples/MLIRVectorGPU/vector-contract.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Indexing maps for a matrix-multiply contraction over dimensions (i, j, k):
// A is indexed by (i, j), B by (j, k), and the accumulator C by (i, k);
// j is the reduction dimension.
#map0 = affine_map<(i, j, k) -> (i, j)>
#map1 = affine_map<(i, j, k) -> (j, k)>
#map2 = affine_map<(i, j, k) -> (i, k)>

module attributes {gpu.container_module} {
  gpu.module @kernel {
    // Kernel: demonstrate vector.contract computing a 3x4 * 4x3 matrix
    // product into a zero-initialized 3x3 accumulator.
    // NOTE(review): %v3 is never stored or printed (the store below is
    // commented out), so this example only exercises lowering of the op.
    gpu.func @vector_contract() kernel {
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c3 = arith.constant 3 : index
      %v0 = arith.constant dense<[[1., 2., 3., 4.],
                                  [5., 6., 7., 8.],
                                  [9., 10., 11., 12.]]> : vector<3x4xf32>
      %v1 = arith.constant dense<[[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.],
                                  [10., 11., 12.]]> : vector<4x3xf32>
      %v2 = arith.constant dense<[[0., 0., 0.],
                                  [0., 0., 0.],
                                  [0., 0., 0.]]> : vector<3x3xf32>
      // "parallel" iterators i and k index the output; the "reduction"
      // iterator j is summed over.
      %v3 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]}
          %v0, %v1, %v2 : vector<3x4xf32>, vector<4x3xf32> into vector<3x3xf32>
      // vector.store %v3, %result[] : memref<vector<3x3xf32>>, vector<3x3xf32>
      gpu.return
    }
  }

  // Host side: launch the kernel on a single thread. There is no observable
  // output (see the kernel-level note above this func in the original file).
  func.func @main() {
    %c1 = arith.constant 1 : index
    gpu.launch_func @kernel::@vector_contract blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args()
    func.return
  }
  // NOTE(review): declared but never called; the unranked-memref-of-vector
  // element type also looks unusual for a print helper — verify before use.
  func.func private @printMemrefF32(%ptr : memref<*xvector<3x3xf32>>)
}
23 changes: 23 additions & 0 deletions examples/MLIRVectorGPU/vector-create-mask.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.create_mask, which builds a mask whose first
    // %cons2 (= 2) lanes are 1 and the remaining lane is 0.
    gpu.func @vector_create_mask(%result: memref<3xi1>) kernel {
      %c0 = arith.constant 0 : index
      %cons2 = arith.constant 2 : index
      %mask0 = vector.create_mask %cons2 : vector<3xi1>
      vector.store %mask0, %result[%c0] : memref<3xi1>, vector<3xi1>
      gpu.return
    }
  }
  // Host side: launch the kernel, read the mask back, and print it.
  func.func @main() {
    %result = memref.alloc() : memref<3xi1>
    %result_cast = memref.cast %result : memref<3xi1> to memref<*xi1>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %result_cast : memref<*xi1>
    gpu.launch_func @kernels::@vector_create_mask blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result : memref<3xi1>)
    %result_v = vector.load %result[%c0] : memref<3xi1>, vector<3xi1>
    vector.print %result_v : vector<3xi1>

    // Free the host buffer; it was previously leaked.
    memref.dealloc %result : memref<3xi1>

    func.return
  }
}
26 changes: 26 additions & 0 deletions examples/MLIRVectorGPU/vector-extract.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Kernel: demonstrate vector.extract on scalars and sub-vectors, printing
    // the results directly from device code with gpu.printf.
    gpu.func @vector_extract() kernel {

      // 3x3 source matrix.
      %base = arith.constant dense<[[0, 1, 2],
                                    [10, 11, 12],
                                    [20, 21, 22]]> : vector<3x3xi32>

      // Extract a single element at row 1, column 1 (value 11).
      %c0 = vector.extract %base[1, 1] : i32 from vector<3x3xi32>
      gpu.printf "%d\n" %c0 : i32

      // Extract row 1 as a 1-D vector, then pull out its three elements.
      %w1 = vector.extract %base[1] : vector<3xi32> from vector<3x3xi32>
      %w1_0 = vector.extract %w1[0] : i32 from vector<3xi32>
      %w1_1 = vector.extract %w1[1] : i32 from vector<3xi32>
      %w1_2 = vector.extract %w1[2] : i32 from vector<3xi32>
      gpu.printf "( %d, %d, %d )\n" %w1_0, %w1_1, %w1_2 : i32, i32, i32
      gpu.return
    }
  }

  // Host side: launch the kernel on a single thread; all output comes from
  // device-side printf.
  func.func @main() {
    %c1 = arith.constant 1 : index
    gpu.launch_func @kernels::@vector_extract blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args()
    func.return
  }
}
32 changes: 32 additions & 0 deletions examples/MLIRVectorGPU/vector-fma.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Source matrix; rows 0-2 supply the three operands of the FMA.
    memref.global "private" @gv : memref<4x4xf32> = dense<[[0. , 1. , 2. , 3. ],
                                                           [10., 11., 12., 13.],
                                                           [20., 21., 22., 23.],
                                                           [30., 31., 32., 33.]]>
    // Kernel: demonstrate vector.fma, computing row0 * row1 + row2
    // elementwise and storing the 4-lane result.
    gpu.func @vector_fma(%result: memref<4xf32>) kernel {
      %mem = memref.get_global @gv : memref<4x4xf32>
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c2 = arith.constant 2 : index
      %load_vec1 = vector.load %mem[%c0, %c0] : memref<4x4xf32>, vector<4xf32>
      %load_vec2 = vector.load %mem[%c1, %c0] : memref<4x4xf32>, vector<4xf32>
      %load_vec3 = vector.load %mem[%c2, %c0] : memref<4x4xf32>, vector<4xf32>
      %res = vector.fma %load_vec1, %load_vec2, %load_vec3 : vector<4xf32>
      vector.store %res, %result[%c0] : memref<4xf32>, vector<4xf32>
      gpu.return
    }
  }

  // Host side: launch the kernel, read the result back, and print it.
  func.func @main() {
    %result = memref.alloc() : memref<4xf32>
    %result_cast = memref.cast %result : memref<4xf32> to memref<*xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.host_register %result_cast : memref<*xf32>
    gpu.launch_func @kernels::@vector_fma blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result : memref<4xf32>)
    %result_v = vector.load %result[%c0] : memref<4xf32>, vector<4xf32>
    vector.print %result_v : vector<4xf32>

    // Free the host buffer; it was previously leaked.
    memref.dealloc %result : memref<4xf32>

    func.return
  }
}
99 changes: 99 additions & 0 deletions examples/MLIRVectorGPU/vector-gather.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
module attributes {gpu.container_module} {
  gpu.module @kernels {
    // Source data for the gather cases below.
    memref.global "private" @gv0 : memref<8xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7]>

    memref.global "private" @gv1 : memref<4x4xi32> = dense<[[0, 1, 2, 3],
                                                            [4, 5, 6, 7],
                                                            [8, 9, 10, 11],
                                                            [12, 13, 14, 15]]>

    // Kernel: demonstrate vector.gather, which loads one element per lane
    // from base memory at the per-lane index, substituting the pass-through
    // value for lanes whose mask bit is off.
    gpu.func @vector_gather(%result0: memref<4xi32>, %result1: memref<4xi32>, %result2: memref<4xi32>, %result3: memref<4xi32>) kernel {

      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c2 = arith.constant 2 : index
      %c3 = arith.constant 3 : index

      %base0 = memref.get_global @gv0 : memref<8xi32>
      %base1 = memref.get_global @gv1 : memref<4x4xi32>

      %pass_thru_4 = arith.constant dense<[2330, 2331, 2332, 2333]> : vector<4xi32>
      %pass_thru_2x2 = arith.constant dense<114> : vector<2x2xi32>

      // normal: all-true mask gathers gv0[3], gv0[4], gv0[2], gv0[1].
      %mask0 = arith.constant dense<1> : vector<4xi1>
      %index0 = arith.constant dense<[3, 4, 2, 1]> : vector<4xi32>
      %v0 = vector.gather %base0[%c0][%index0], %mask0, %pass_thru_4
          : memref<8xi32>, vector<4xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
      vector.store %v0, %result0[%c0] : memref<4xi32>, vector<4xi32>

      // with mask: lanes 1 and 3 are masked off and take pass-through values.
      %mask1 = arith.constant dense<[1, 0, 1, 0]> : vector<4xi1>
      %index1 = arith.constant dense<[3, 4, 2, 1]> : vector<4xi32>
      %v1 = vector.gather %base0[%c0][%index1], %mask1, %pass_thru_4
          : memref<8xi32>, vector<4xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
      vector.store %v1, %result1[%c0] : memref<4xi32>, vector<4xi32>

      // 2-D gather: per-lane offsets are applied relative to %base1[1, 1].
      %mask2 = arith.constant dense<1> : vector<2x2xi1>
      %index2 = arith.constant dense<[[1, 0], [3, 2]]> : vector<2x2xi32>
      %v2 = vector.gather %base1[%c1, %c1][%index2], %mask2, %pass_thru_2x2
          : memref<4x4xi32>, vector<2x2xi32>, vector<2x2xi1>, vector<2x2xi32> into vector<2x2xi32>
      %v2_shape_casted = vector.shape_cast %v2 : vector<2x2xi32> to vector<4xi32>
      vector.store %v2_shape_casted, %result2[%c0] : memref<4xi32>, vector<4xi32>

      // Out-of-bound indices.
      %mask3 = arith.constant dense<1> : vector<2x2xi1>
      %index3 = arith.constant dense<[[-1, -8], [5, 13]]> : vector<2x2xi32>
      %v3 = vector.gather %base1[%c1, %c1][%index3], %mask3, %pass_thru_2x2
          : memref<4x4xi32>, vector<2x2xi32>, vector<2x2xi1>, vector<2x2xi32> into vector<2x2xi32>

      // ( ( 4, 0), ( 10, 0 ) ).
      // On GPU, if indices are out-of-bound, the elements will be 0, which is different
      // from the CPU case.
      %v3_shape_casted = vector.shape_cast %v3 : vector<2x2xi32> to vector<4xi32>
      vector.store %v3_shape_casted, %result3[%c0] : memref<4xi32>, vector<4xi32>

      gpu.return
    }
  }

  // Host side: allocate and register the four result buffers, launch the
  // kernel on a single thread, then print each result.
  func.func @main() {
    %result0 = memref.alloc() : memref<4xi32>
    %result1 = memref.alloc() : memref<4xi32>
    %result2 = memref.alloc() : memref<4xi32>
    %result3 = memref.alloc() : memref<4xi32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index

    // register host memory
    %result0_cast = memref.cast %result0 : memref<4xi32> to memref<*xi32>
    %result1_cast = memref.cast %result1 : memref<4xi32> to memref<*xi32>
    %result2_cast = memref.cast %result2 : memref<4xi32> to memref<*xi32>
    %result3_cast = memref.cast %result3 : memref<4xi32> to memref<*xi32>

    gpu.host_register %result0_cast : memref<*xi32>
    gpu.host_register %result1_cast : memref<*xi32>
    gpu.host_register %result2_cast : memref<*xi32>
    gpu.host_register %result3_cast : memref<*xi32>

    gpu.launch_func @kernels::@vector_gather blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%result0 : memref<4xi32>, %result1 : memref<4xi32>, %result2 : memref<4xi32>, %result3 : memref<4xi32>)

    %result0_v = vector.load %result0[%c0] : memref<4xi32>, vector<4xi32>
    vector.print %result0_v : vector<4xi32>

    %result1_v = vector.load %result1[%c0] : memref<4xi32>, vector<4xi32>
    vector.print %result1_v : vector<4xi32>

    %result2_v = vector.load %result2[%c0] : memref<4xi32>, vector<4xi32>
    %result2_v_reshape = vector.shape_cast %result2_v : vector<4xi32> to vector<2x2xi32>
    vector.print %result2_v_reshape : vector<2x2xi32>

    %result3_v = vector.load %result3[%c0] : memref<4xi32>, vector<4xi32>
    %result3_v_reshape = vector.shape_cast %result3_v : vector<4xi32> to vector<2x2xi32>
    vector.print %result3_v_reshape : vector<2x2xi32>

    // Free the host buffers; they were previously leaked. (The unused
    // %base2/%pass_thru_8 locals and the unreferenced private global @gv2
    // were also dropped.)
    memref.dealloc %result0 : memref<4xi32>
    memref.dealloc %result1 : memref<4xi32>
    memref.dealloc %result2 : memref<4xi32>
    memref.dealloc %result3 : memref<4xi32>

    func.return
  }
}
Loading

0 comments on commit f70aeab

Please sign in to comment.