Skip to content

Commit

Permalink
fix batchMatMul-opt pass bug
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Aug 13, 2024
1 parent c9e14b3 commit f4fcd2c
Show file tree
Hide file tree
Showing 5 changed files with 297 additions and 1 deletion.
1 change: 1 addition & 0 deletions examples/batchMatMulOpt/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
log.*
51 changes: 51 additions & 0 deletions examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

// Entry point: fills A (1x1x576) with 3.0 and B (1x576x1024) with 2.0,
// computes C = A * B via linalg.batch_matmul, and prints C.
// Every element of C should be 3.0 * 2.0 * 576 = 3456.
func.func @main(){

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c576 = arith.constant 576 : index
  %c1024 = arith.constant 1024 : index
  %f2 = arith.constant 2.0 : f32
  %f3 = arith.constant 3.0 : f32
  %f0 = arith.constant 0.0 : f32

  // A: every element set to 3.0.
  %a = memref.alloc() : memref<1x1x576xf32>
  scf.for %arg0 = %c0 to %c1 step %c1 {
    scf.for %arg1 = %c0 to %c1 step %c1 {
      scf.for %arg2 = %c0 to %c576 step %c1 {
        memref.store %f3, %a[%arg0, %arg1, %arg2] : memref<1x1x576xf32>
      }
    }
  }

  // B: every element set to 2.0.
  %b = memref.alloc() : memref<1x576x1024xf32>
  scf.for %arg0 = %c0 to %c1 step %c1 {
    scf.for %arg1 = %c0 to %c576 step %c1 {
      scf.for %arg2 = %c0 to %c1024 step %c1 {
        memref.store %f2, %b[%arg0, %arg1, %arg2] : memref<1x576x1024xf32>
      }
    }
  }

  // C: zero-initialized accumulator for batch_matmul.
  // BUG FIX: the inner loop bound was %c1000, which left C[0, 0, 1000..1023]
  // uninitialized even though C has extent 1024 in its last dimension; the
  // matmul then accumulated into garbage. Clear the full 1024 extent.
  %c = memref.alloc() : memref<1x1x1024xf32>
  scf.for %arg0 = %c0 to %c1 step %c1 {
    scf.for %arg1 = %c0 to %c1 step %c1 {
      scf.for %arg2 = %c0 to %c1024 step %c1 {
        memref.store %f0, %c[%arg0, %arg1, %arg2] : memref<1x1x1024xf32>
      }
    }
  }

  // C += A * B (batched; batch size 1 here).
  linalg.batch_matmul
    ins(%a, %b : memref<1x1x576xf32>, memref<1x576x1024xf32>)
    outs(%c : memref<1x1x1024xf32>)

  %printed_c = memref.cast %c : memref<1x1x1024xf32> to memref<*xf32>
  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()

  // Release the buffers allocated above (memref.alloc has no automatic
  // deallocation).
  memref.dealloc %a : memref<1x1x576xf32>
  memref.dealloc %b : memref<1x576x1024xf32>
  memref.dealloc %c : memref<1x1x1024xf32>

  return
}
73 changes: 73 additions & 0 deletions examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm \
// RUN: -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-loops -convert-scf-to-cf -llvm-request-c-wrappers \
// RUN: -convert-func-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

#map = affine_map<(d0) -> (d0 mod 64)>
#map1 = affine_map<(d0) -> (d0 floordiv 64)>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0) -> (d0 * 64)>
#set = affine_set<(d0)[s0] : (d0 * -64 + s0 - 64 >= 0)>

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

// Accumulates %arg2 += %arg0 * %arg1 via a single linalg.batch_matmul with
// shapes (1x1x1024) x (1x1024x1000) -> (1x1x1000).
// NOTE(review): the name @conv_2d is misleading — the body is a batch matmul,
// not a 2-D convolution; consider renaming (callers elsewhere would need
// updating, so left as-is here).
// NOTE(review): this function is not called from @main in this file —
// presumably it exists as a standalone target for the -batchmatmul-optimize
// pass; confirm.
func.func @conv_2d(%arg0: memref<1x1x1024xf32>, %arg1: memref<1x1024x1000xf32>, %arg2: memref<1x1x1000xf32>) {
linalg.batch_matmul
ins(%arg0, %arg1 : memref<1x1x1024xf32>, memref<1x1024x1000xf32>)
outs(%arg2 : memref<1x1x1000xf32>)
return
}

// Entry point: fills A (1x1x1024) with 3.0 and B (1x1024x1000) with 2.0,
// computes C = A * B via linalg.batch_matmul, and prints C.
// Every element of C should be 3.0 * 2.0 * 1024 = 6144.
func.func @main(){

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1024 = arith.constant 1024 : index
  %c1000 = arith.constant 1000 : index
  %f0 = arith.constant 0.0 : f32
  %f2 = arith.constant 2.0 : f32
  %f3 = arith.constant 3.0 : f32

  // A: every element set to 3.0.
  %a = memref.alloc() : memref<1x1x1024xf32>
  scf.for %arg0 = %c0 to %c1 step %c1 {
    scf.for %arg1 = %c0 to %c1 step %c1 {
      scf.for %arg2 = %c0 to %c1024 step %c1 {
        memref.store %f3, %a[%arg0, %arg1, %arg2] : memref<1x1x1024xf32>
      }
    }
  }

  // B: every element set to 2.0.
  %b = memref.alloc() : memref<1x1024x1000xf32>
  scf.for %arg0 = %c0 to %c1 step %c1 {
    scf.for %arg1 = %c0 to %c1024 step %c1 {
      scf.for %arg2 = %c0 to %c1000 step %c1 {
        memref.store %f2, %b[%arg0, %arg1, %arg2] : memref<1x1024x1000xf32>
      }
    }
  }

  // C: zero-initialized accumulator for batch_matmul.
  %c = memref.alloc() : memref<1x1x1000xf32>
  scf.for %arg0 = %c0 to %c1 step %c1 {
    scf.for %arg1 = %c0 to %c1 step %c1 {
      scf.for %arg2 = %c0 to %c1000 step %c1 {
        memref.store %f0, %c[%arg0, %arg1, %arg2] : memref<1x1x1000xf32>
      }
    }
  }

  // C += A * B (batched; batch size 1 here).
  linalg.batch_matmul
    ins(%a, %b : memref<1x1x1024xf32>, memref<1x1024x1000xf32>)
    outs(%c : memref<1x1x1000xf32>)

  %printed_c = memref.cast %c : memref<1x1x1000xf32> to memref<*xf32>
  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
  // BUG FIX: the previous CHECK lines expected a 2x2x4 memref with unrelated
  // values (98, 226, ...) — clearly copied from another test. The actual
  // output is a 1x1x1000 memref where every element is 3.0 * 2.0 * 1024 = 6144.
  // CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 3 offset = 0 sizes = \[1, 1, 1000\] strides = \[1000, 1000, 1\] data =}}
  // CHECK{LITERAL}: [[[6144,

  // Release the buffers allocated above (memref.alloc has no automatic
  // deallocation).
  memref.dealloc %a : memref<1x1x1024xf32>
  memref.dealloc %b : memref<1x1024x1000xf32>
  memref.dealloc %c : memref<1x1x1000xf32>

  return
}
171 changes: 171 additions & 0 deletions examples/batchMatMulOpt/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Makefile for the batchMatMulOpt examples.
# (This is a Makefile, not a shell script — the previous `#!/bin/bash`
# shebang was misleading; run targets with `make <target>`.)
BUDDY_OPT := ../../build/bin/buddy-opt
MLIR_OPT := ../../llvm/build/bin/mlir-opt
MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
LLC := ../../llvm/build/bin/llc
OPT_FLAG := -O0

# Evaluate `uname` once at parse time instead of once per ifeq.
UNAME := $(shell uname)

ifeq ($(UNAME),Linux)
MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
MTRIPLE := x86_64-unknown-linux-gnu
else ifeq ($(UNAME),Darwin)
MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
MTRIPLE := x86_64-apple-darwin
endif

# All targets are commands, not files — mark them phony so stale files with
# the same names can never shadow them.
.PHONY: batch-matmul-1-lower batch-matmul-1-translate batch-matmul-1-run \
        batch-matmul-2-lower batch-matmul-2-translate batch-matmul-2-run

# Lower linalg-batch-matmul-f32-1.mlir through the batch-matmul-optimized
# pipeline down to the LLVM dialect; output goes to ./log.mlir.
# BUG FIX: previously read ./linalg-batch-matmul-f32.mlir, which does not
# exist — the file shipped with this example is linalg-batch-matmul-f32-1.mlir.
batch-matmul-1-lower:
	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
	${BUDDY_OPT} \
		-convert-elementwise-to-linalg \
		-func-bufferize-dynamic-offset \
		-arith-bufferize \
		-func-bufferize \
		-tensor-bufferize \
		-linalg-bufferize \
		-finalizing-bufferize \
		-batchmatmul-optimize \
		-convert-linalg-to-affine-loops \
		-lower-affine \
		-convert-vector-to-scf \
		-convert-scf-to-cf \
		-llvm-request-c-wrappers \
		-convert-vector-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-arith-to-llvm \
		-convert-func-to-llvm \
		-expand-strided-metadata \
		-finalize-memref-to-llvm \
		-reconcile-unrealized-casts \
		-o ./log.mlir


# Same pipeline as batch-matmul-1-lower, then translate the LLVM-dialect
# module to LLVM IR in ./log.ll.
# BUG FIX: previously read ./linalg-batch-matmul-f32.mlir, which does not
# exist — the file shipped with this example is linalg-batch-matmul-f32-1.mlir.
batch-matmul-1-translate:
	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
	${BUDDY_OPT} \
		-convert-elementwise-to-linalg \
		-func-bufferize-dynamic-offset \
		-arith-bufferize \
		-func-bufferize \
		-tensor-bufferize \
		-linalg-bufferize \
		-finalizing-bufferize \
		-batchmatmul-optimize \
		-convert-linalg-to-affine-loops \
		-lower-affine \
		-convert-vector-to-scf \
		-convert-scf-to-cf \
		-llvm-request-c-wrappers \
		-convert-vector-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-arith-to-llvm \
		-convert-func-to-llvm \
		-expand-strided-metadata \
		-finalize-memref-to-llvm \
		-reconcile-unrealized-casts | \
	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll

# Same pipeline as batch-matmul-1-lower, then JIT-execute @main with
# mlir-cpu-runner and the MLIR runner-utils shared libraries.
# BUG FIX: previously read ./linalg-batch-matmul-f32.mlir, which does not
# exist — the file shipped with this example is linalg-batch-matmul-f32-1.mlir.
batch-matmul-1-run:
	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
	${BUDDY_OPT} \
		-convert-elementwise-to-linalg \
		-func-bufferize-dynamic-offset \
		-arith-bufferize \
		-func-bufferize \
		-tensor-bufferize \
		-linalg-bufferize \
		-finalizing-bufferize \
		-batchmatmul-optimize \
		-convert-linalg-to-affine-loops \
		-lower-affine \
		-convert-vector-to-scf \
		-convert-scf-to-cf \
		-llvm-request-c-wrappers \
		-convert-vector-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-arith-to-llvm \
		-convert-func-to-llvm \
		-expand-strided-metadata \
		-finalize-memref-to-llvm \
		-reconcile-unrealized-casts | \
	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}


# Run linalg-batch-matmul-f32-2.mlir through bufferization and the
# -batchmatmul-optimize pass only (stops before lowering to LLVM), writing
# the optimized IR to ./log.mlir for inspection.
batch-matmul-2-lower:
	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
	${BUDDY_OPT} \
		-convert-elementwise-to-linalg \
		-func-bufferize-dynamic-offset \
		-arith-bufferize \
		-func-bufferize \
		-tensor-bufferize \
		-linalg-bufferize \
		-finalizing-bufferize \
		-batchmatmul-optimize \
		-o ./log.mlir


# Full lowering of linalg-batch-matmul-f32-2.mlir (bufferize, optimize the
# batch matmul, lower to the LLVM dialect), then translate to LLVM IR in
# ./log.ll.
batch-matmul-2-translate:
	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
	${BUDDY_OPT} \
		-convert-elementwise-to-linalg \
		-func-bufferize-dynamic-offset \
		-arith-bufferize \
		-func-bufferize \
		-tensor-bufferize \
		-linalg-bufferize \
		-finalizing-bufferize \
		-batchmatmul-optimize \
		-convert-linalg-to-affine-loops \
		-lower-affine \
		-convert-vector-to-scf \
		-convert-scf-to-cf \
		-llvm-request-c-wrappers \
		-convert-vector-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-arith-to-llvm \
		-convert-func-to-llvm \
		-expand-strided-metadata \
		-finalize-memref-to-llvm \
		-reconcile-unrealized-casts | \
	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll

# Full lowering of linalg-batch-matmul-f32-2.mlir, then JIT-execute @main
# with mlir-cpu-runner and the MLIR runner-utils shared libraries.
batch-matmul-2-run:
	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
	${BUDDY_OPT} \
		-convert-elementwise-to-linalg \
		-func-bufferize-dynamic-offset \
		-arith-bufferize \
		-func-bufferize \
		-tensor-bufferize \
		-linalg-bufferize \
		-finalizing-bufferize \
		-batchmatmul-optimize \
		-convert-linalg-to-affine-loops \
		-lower-affine \
		-convert-vector-to-scf \
		-convert-scf-to-cf \
		-llvm-request-c-wrappers \
		-convert-vector-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-arith-to-llvm \
		-convert-func-to-llvm \
		-expand-strided-metadata \
		-finalize-memref-to-llvm \
		-reconcile-unrealized-casts | \
	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class BatchMatMulOptimizePattern : public ConversionPattern {

// Apply the column of matrix B.
Value appliedColOfB = rewriter.create<affine::AffineApplyOp>(
loc, AffineMap::get(1, 0, d0.ceilDiv(affineVectorSize)),
loc, AffineMap::get(1, 0, d0.floorDiv(affineVectorSize)),
ValueRange{bCol});

// Create the primary parallel batch level loop.
Expand Down

0 comments on commit f4fcd2c

Please sign in to comment.