diff --git a/examples/batchMatMulOpt/.gitignore b/examples/batchMatMulOpt/.gitignore
new file mode 100644
index 0000000000..80a243fa81
--- /dev/null
+++ b/examples/batchMatMulOpt/.gitignore
@@ -0,0 +1 @@
+log.*
diff --git a/examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir b/examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir
new file mode 100644
index 0000000000..42640a3f32
--- /dev/null
+++ b/examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir
@@ -0,0 +1,49 @@
+
+func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }
+
+func.func @main() {
+
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c576 = arith.constant 576 : index
+  %c1024 = arith.constant 1024 : index
+  %f2 = arith.constant 2.0 : f32
+  %f3 = arith.constant 3.0 : f32
+  %f0 = arith.constant 0.0 : f32
+
+  %a = memref.alloc() : memref<1x1x576xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c576 step %c1 {
+        memref.store %f3, %a[%arg0, %arg1, %arg2] : memref<1x1x576xf32>
+      }
+    }
+  }
+
+  %b = memref.alloc() : memref<1x576x1024xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c576 step %c1 {
+      scf.for %arg2 = %c0 to %c1024 step %c1 {
+        memref.store %f2, %b[%arg0, %arg1, %arg2] : memref<1x576x1024xf32>
+      }
+    }
+  }
+
+  %c = memref.alloc() : memref<1x1x1024xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c1024 step %c1 {
+        memref.store %f0, %c[%arg0, %arg1, %arg2] : memref<1x1x1024xf32>
+      }
+    }
+  }
+
+  linalg.batch_matmul
+    ins(%a, %b : memref<1x1x576xf32>, memref<1x576x1024xf32>)
+    outs(%c : memref<1x1x1024xf32>)
+
+  %printed_c = memref.cast %c : memref<1x1x1024xf32> to memref<*xf32>
+  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
+
+  return
+}
diff --git a/examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir b/examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir
new file mode 100644
index 0000000000..9c58ca7ff2
--- /dev/null
+++ b/examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir
@@ -0,0 +1,72 @@
+// RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm \
+// RUN: -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-loops -convert-scf-to-cf -llvm-request-c-wrappers \
+// RUN: -convert-func-to-llvm -reconcile-unrealized-casts %s \
+// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+#map = affine_map<(d0) -> (d0 mod 64)>
+#map1 = affine_map<(d0) -> (d0 floordiv 64)>
+#map2 = affine_map<(d0) -> (d0)>
+#map3 = affine_map<(d0) -> (d0 * 64)>
+#set = affine_set<(d0)[s0] : (d0 * -64 + s0 - 64 >= 0)>
+
+func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }
+
+func.func @batch_matmul(%arg0: memref<1x1x1024xf32>, %arg1: memref<1x1024x1000xf32>, %arg2: memref<1x1x1000xf32>) {
+  linalg.batch_matmul
+    ins(%arg0, %arg1 : memref<1x1x1024xf32>, memref<1x1024x1000xf32>)
+    outs(%arg2 : memref<1x1x1000xf32>)
+  return
+}
+
+func.func @main() {
+
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c1024 = arith.constant 1024 : index
+  %c1000 = arith.constant 1000 : index
+  %f0 = arith.constant 0.0 : f32
+  %f2 = arith.constant 2.0 : f32
+  %f3 = arith.constant 3.0 : f32
+
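+  // %a holds 3.0 and %b holds 2.0 everywhere, so each of the 1000 output
+  // elements accumulates 1024 * 3.0 * 2.0 = 6144.0; the CHECK lines at the
+  // bottom of @main verify that value.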
+  %a = memref.alloc() : memref<1x1x1024xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c1024 step %c1 {
+        memref.store %f3, %a[%arg0, %arg1, %arg2] : memref<1x1x1024xf32>
+      }
+    }
+  }
+
+  %b = memref.alloc() : memref<1x1024x1000xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1024 step %c1 {
+      scf.for %arg2 = %c0 to %c1000 step %c1 {
+        memref.store %f2, %b[%arg0, %arg1, %arg2] : memref<1x1024x1000xf32>
+      }
+    }
+  }
+
+  %c = memref.alloc() : memref<1x1x1000xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c1000 step %c1 {
+        memref.store %f0, %c[%arg0, %arg1, %arg2] : memref<1x1x1000xf32>
+      }
+    }
+  }
+
+  linalg.batch_matmul
+    ins(%a, %b : memref<1x1x1024xf32>, memref<1x1024x1000xf32>)
+    outs(%c : memref<1x1x1000xf32>)
+
+  %printed_c = memref.cast %c : memref<1x1x1000xf32> to memref<*xf32>
+  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 1, 1000] strides = [1000, 1000, 1] data =
+  // CHECK-NEXT: [[[6144,
+  return
+}
diff --git a/examples/batchMatMulOpt/makefile b/examples/batchMatMulOpt/makefile
new file mode 100644
index 0000000000..68bb28a5b3
--- /dev/null
+++ b/examples/batchMatMulOpt/makefile
@@ -0,0 +1,175 @@
+# Build/run targets for the batch matmul optimization examples.
+BUDDY_OPT := ../../build/bin/buddy-opt
+MLIR_OPT := ../../llvm/build/bin/mlir-opt
+MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
+MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
+LLC := ../../llvm/build/bin/llc
+OPT_FLAG := -O0
+
+ifeq ($(shell uname),Linux)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+MTRIPLE := x86_64-unknown-linux-gnu
+else ifeq ($(shell uname),Darwin)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
+MTRIPLE := x86_64-apple-darwin
+endif
+
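+# Each example has three flavors of target: `*-lower` writes the transformed
+# MLIR to ./log.mlir, `*-translate` emits LLVM IR to log.ll, and `*-run`
+# executes the example with mlir-cpu-runner and prints the result memref.
+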
+batch-matmul-1-lower:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts \
+	-o ./log.mlir
+
+
+batch-matmul-1-translate:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+batch-matmul-1-run:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+
+batch-matmul-2-lower:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-o ./log.mlir
+
+
+batch-matmul-2-translate:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+batch-matmul-2-run:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
\ No newline at end of file
diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
index 757ac8ae91..864c30c850 100644
--- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
@@ -105,7 +105,9 @@ class BatchMatMulOptimizePattern : public ConversionPattern {
 
     // Apply the column of matrix B.
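+    // d0.floorDiv(affineVectorSize) yields the number of complete
+    // vector-width column blocks; ceilDiv also counted the final partial one.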
     Value appliedColOfB = rewriter.create<affine::AffineApplyOp>(
-        loc, AffineMap::get(1, 0, d0.ceilDiv(affineVectorSize)),
+        loc, AffineMap::get(1, 0, d0.floorDiv(affineVectorSize)),
         ValueRange{bCol});
 
     // Create the primary parallel batch level loop.