Merge pull request #4 from FloatingcloudKnight/revert-3-vector

Revert "[midend/lib/Conversion/ConvVectorization] Add cond2dnhwcfhwc pass and poolingnhwcmax pass"
buddy-compiler · Oct 23, 2024 · 06fa710 · 06fa710
2 parents ce0a6d9 + 191658d
commit 06fa710
Show file tree

Hide file tree

Showing 32 changed files with 93 additions and 4,017 deletions.
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
@@ -14,24 +14,22 @@
 // RUN: | FileCheck %s
 
 // Using `8` as the vector size.
-#map = affine_map<(d0) -> (d0 ceildiv 16)>
+#map = affine_map<(d0) -> (d0 floordiv 8)>
 #map0 = affine_map<(d0, d1, d2, d3) -> (d2)>
 #map1 = affine_map<(d0, d1) -> (d0 + d1)>
-#map2 = affine_map<(d0, d1) -> (d0 + d1 * 16)>
-#map3 = affine_map<(d0) -> (d0 * 16)>
+#map2 = affine_map<(d0, d1) -> (d0 + d1 * 8)>
+#map3 = affine_map<(d0) -> (d0 * 8)>
 
 module {
   func.func private @printMemrefF32(memref<*xf32>)
   func.func private @rtclock() -> f64
 
   func.func @conv_2d_nhwc_fhwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+    %f0 = arith.constant 0. : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
     %c3 = arith.constant 3 : index
-    %c32 = arith.constant 16 : index
-    %f0 = arith.constant 0.000000e+00 : f32
-    %0 = vector.splat %f0 : vector<16xf32>
     %n = memref.dim %arg0, %c0 : memref<?x?x?x?xf32>
     %h_i = memref.dim %arg0, %c1 : memref<?x?x?x?xf32>
     %w_i = memref.dim %arg0, %c2 : memref<?x?x?x?xf32>
@@ -47,53 +45,22 @@ module {
       affine.for %idx_f = %c0 to %f {
         affine.for %idx_c = %c0 to %c {
           affine.for %idx_h_o = %c0 to %h_o {
-            affine.for %idx_w_o = %c0 to #map(%w_o) {         
-              %1 = arith.muli %idx_w_o, %c32 : index
-              %2 = arith.subi %w_o, %1 : index
-              %3 = arith.cmpi sge, %2, %c32 : index
-              scf.if %3 {
-                // %arg2[%n, %h_o, %w_o*16, %f]
-                %output_vec = vector.transfer_read %arg2[%idx_n, %idx_h_o, %1, %idx_f], %f0
-                      { permutation_map = #map0 } : memref<?x?x?x?xf32>, vector<16xf32>
-                %5 = affine.for %idx_h_k = %c0 to %h_k iter_args(%arg8 = %output_vec) -> (vector<16xf32>) {        // %h_k
-                  %6 = affine.for %idx_w_k = %c0 to %w_k iter_args(%arg10 = %arg8) -> (vector<16xf32>) {           // %w_k
-                    // %arg1[%f, %h_k, %w_k, %c]
-                    %kernel_ele = memref.load %arg1[%idx_f, %idx_h_k, %idx_w_k, %idx_c] : memref<?x?x?x?xf32>
-                    %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<16xf32>
-                    %in_iter_h = affine.apply #map1 (%idx_h_k, %idx_h_o)
-                    %in_iter_w = affine.apply #map2 (%idx_w_k, %idx_w_o)
-                    // %arg0[%n, %h_k+%h_o, %w_k+%w_o*16, %c]
-                    %input_vec = vector.transfer_read %arg0[%idx_n, %in_iter_h, %in_iter_w, %idx_c], %f0
-                      { permutation_map = #map0 } : memref<?x?x?x?xf32>, vector<16xf32>
-                    %res_vec = vector.fma %kernel_vec, %input_vec, %arg10 : vector<16xf32>
-                    affine.yield %res_vec : vector<16xf32>
-                  }
-                  affine.yield %6 : vector<16xf32>
-                }
-                 vector.transfer_write %5, %arg2[%idx_n, %idx_h_o, %1, %idx_f]
-                  { permutation_map = #map0 } : vector<16xf32>, memref<?x?x?x?xf32>
-              } else {
-                %9 = vector.create_mask %2 : vector<16xi1>
-                // %arg2[%n, %h_o, %w_o*16, %f]
-                %output_vec = vector.transfer_read %arg2[%idx_n, %idx_h_o, %1, %idx_f], %f0, %9
-                      { permutation_map = #map0 } : memref<?x?x?x?xf32>, vector<16xf32>
-                %5 = affine.for %idx_h_k = %c0 to %h_k iter_args(%arg8 = %output_vec) -> (vector<16xf32>) {        // %h_k
-                  %6 = affine.for %idx_w_k = %c0 to %w_k iter_args(%arg10 = %arg8) -> (vector<16xf32>) {           // %w_k
-                    // %arg1[%f, %h_k, %w_k, %c]
-                    %kernel_ele = memref.load %arg1[%idx_f, %idx_h_k, %idx_w_k, %idx_c] : memref<?x?x?x?xf32>
-                    %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<16xf32>
-                    %in_iter_h = affine.apply #map1 (%idx_h_k, %idx_h_o)
-                    %in_iter_w = affine.apply #map2 (%idx_w_k, %idx_w_o)
-                    // %arg0[%n, %h_k+%h_o, %w_k+%w_o*16, %c]
-                    %input_vec = vector.transfer_read %arg0[%idx_n, %in_iter_h, %in_iter_w, %idx_c], %f0, %9
-                      { permutation_map = #map0 } : memref<?x?x?x?xf32>, vector<16xf32>
-                    %res_vec = vector.fma %kernel_vec, %input_vec, %arg10 : vector<16xf32>
-                    affine.yield %res_vec : vector<16xf32>
-                  }
-                  affine.yield %6 : vector<16xf32>
+            affine.for %idx_h_k = %c0 to %h_k {
+              affine.for %idx_w_k = %c0 to %w_k {
+                affine.for %idx_w_o = %c0 to #map(%w_o) {
+                  %kernel_ele = memref.load %arg1[%idx_f, %idx_h_k, %idx_w_k, %idx_c] : memref<?x?x?x?xf32>
+                  %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<8xf32>
+                  %in_iter_h = affine.apply #map1 (%idx_h_k, %idx_h_o)
+                  %in_iter_w = affine.apply #map2 (%idx_w_k, %idx_w_o)
+                  %out_iter_w = affine.apply #map3 (%idx_w_o)
+                  %input_vec = vector.transfer_read %arg0[%idx_n, %in_iter_h, %in_iter_w, %idx_c], %f0
+                    { permutation_map = #map0 } : memref<?x?x?x?xf32>, vector<8xf32>
+                  %output_vec = vector.transfer_read %arg2[%idx_n, %idx_h_o, %out_iter_w, %idx_f], %f0
+                    { permutation_map = #map0 } : memref<?x?x?x?xf32>, vector<8xf32>
+                  %res_vec = vector.fma %kernel_vec, %input_vec, %output_vec : vector<8xf32>
+                  vector.transfer_write %res_vec, %arg2[%idx_n, %idx_h_o, %out_iter_w, %idx_f]
+                    { permutation_map = #map0 } : vector<8xf32>, memref<?x?x?x?xf32>
                 }
-                vector.transfer_write %5, %arg2[%idx_n, %idx_h_o, %1, %idx_f], %9
-                { permutation_map = #map0 } : vector<16xf32>, memref<?x?x?x?xf32>
               }
             }
           }
@@ -140,8 +107,8 @@ module {
     // %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
     // %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
 
-    %v0 = call @alloc_f32(%c1, %c28, %c28, %c5, %f2) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
-    %v1 = call @alloc_f32(%c6, %c5, %c5, %c5, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    %v0 = call @alloc_f32(%c1, %c28, %c28, %c1, %f2) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    %v1 = call @alloc_f32(%c6, %c5, %c5, %c1, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
     %v2 = call @alloc_f32(%c1, %c24, %c24, %c6, %f0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
 
     %t_start = call @rtclock() : () -> f64
@@ -154,7 +121,7 @@ module {
     // CHECK: [
     // CHECK: [
     // CHECK: [
-    // CHECK: [750{{(, 750)*}}],
+    // CHECK: [150{{(, 150)*}}],
     %print_v2 = memref.cast %v2 : memref<?x?x?x?xf32> to memref<*xf32>
     call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
 

diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc-vec.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc-vec.mlir
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir
@@ -17,10 +17,9 @@ module {
   func.func private @printMemrefF32(memref<*xf32>)
   func.func private @rtclock() -> f64
 
-  func.func @conv_2d_nhwc_fhwc(%arg0: memref<1x12x12x6xf32>, %arg1: memref<16x5x5x6xf32>, %arg2: memref<1x8x8x16xf32>) {
-    linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
-            ins (%arg0, %arg1: memref<1x12x12x6xf32>, memref<16x5x5x6xf32>)
-            outs (%arg2: memref<1x8x8x16xf32>)
+  func.func @conv_2d_nhwc_fhwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+    linalg.conv_2d_nhwc_fhwc ins (%arg0, %arg1: memref<?x?x?x?xf32>, memref<?x?x?x?xf32>)
+                             outs (%arg2: memref<?x?x?x?xf32>)
     return
   }
 
@@ -55,17 +54,17 @@ module {
     %c16 = arith.constant 16 : index
     %c24 = arith.constant 24 : index
     %c28 = arith.constant 28 : index
-
-    %v0 = call @alloc_f32(%c1, %c12, %c12, %c6, %f2) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
-    %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
-    %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
 
-    %a = memref.cast %v0 : memref<?x?x?x?xf32> to memref<1x12x12x6xf32>
-    %b = memref.cast %v1 : memref<?x?x?x?xf32> to memref<16x5x5x6xf32>
-    %c = memref.cast %v2 : memref<?x?x?x?xf32> to memref<1x8x8x16xf32>
+    // %v0 = call @alloc_f32(%c1, %c12, %c12, %c6, %f2) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+
+    %v0 = call @alloc_f32(%c1, %c28, %c28, %c1, %f2) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    %v1 = call @alloc_f32(%c6, %c5, %c5, %c1, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    %v2 = call @alloc_f32(%c1, %c24, %c24, %c6, %f0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
 
     %t_start = call @rtclock() : () -> f64
-    call @conv_2d_nhwc_fhwc(%a, %b, %c) : (memref<1x12x12x6xf32>, memref<16x5x5x6xf32>, memref<1x8x8x16xf32>) -> ()
+    call @conv_2d_nhwc_fhwc(%v0, %v1, %v2) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
     %t_end = call @rtclock() : () -> f64
 
     // All the elements of the MemRef are the same,
@@ -74,8 +73,8 @@ module {
     // CHECK: [
     // CHECK: [
     // CHECK: [
-    // CHECK: [900{{(, 900)*}}],
-    %print_v2 = memref.cast %c : memref<1x8x8x16xf32> to memref<*xf32>
+    // CHECK: [150{{(, 150)*}}],
+    %print_v2 = memref.cast %v2 : memref<?x?x?x?xf32> to memref<*xf32>
     call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
 
     %time = arith.subf %t_end, %t_start : f64

diff --git a/examples/BuddyConvolution/makefile b/examples/BuddyConvolution/makefile
@@ -125,34 +125,3 @@ conv2d-nhwc-fhwc-opt-aot:
 		-L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
 		-o a.out
 	@LD_LIBRARY_PATH=${MLIR_LIB} ./a.out
-
-conv2d-nhwc-fhwc-vec-run:
-	@${BUDDY_OPT} ./conv2d-nhwc-fhwc-vec.mlir \
-		-convert-vector-to-scf \
-		-lower-affine \
-		-arith-bufferize \
-		-convert-scf-to-cf \
-		-convert-vector-to-llvm \
-		-convert-arith-to-llvm \
-		-finalize-memref-to-llvm \
-		-convert-func-to-llvm \
-		-reconcile-unrealized-casts | \
-	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
-		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
-
-conv2d-nhwc-fhwc-vec-aot:
-	@${BUDDY_OPT} ./conv2d-nhwc-fhwc-vec.mlir \
-		-convert-vector-to-scf \
-		-lower-affine \
-		-arith-bufferize \
-		-convert-scf-to-cf \
-		-convert-vector-to-llvm \
-		-convert-arith-to-llvm \
-		-finalize-memref-to-llvm \
-		-convert-func-to-llvm \
-		-reconcile-unrealized-casts | \
-	${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll
-	${CLANG} log.ll -O3 \
-		-L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
-		-o a.out
-	@LD_LIBRARY_PATH=${MLIR_LIB} ./a.out
diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt
@@ -27,8 +27,6 @@ add_custom_command(
             -linalg-bufferize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
-            -conv2d-nhwc-fhwc-vectorization
-            -pooling-nhwc-max-vectorization
             -lower-affine
             -func-bufferize-dynamic-offset
             -arith-bufferize
@@ -53,6 +51,7 @@ add_custom_command(
   VERBATIM)
 
 add_library(LENET STATIC subgraph0.o forward.o)
+
 SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C)
 
 add_executable(buddy-lenet-run buddy-lenet-main.cpp)