# Reviewer notes for this patch (examples/BuddyNext).
#
# Files touched:
#   .gitignore
#       The three entries log.mlir / log.ll / log.s are replaced by the single
#       pattern log.*.
#   makefile
#       Defines LIB_OMP (../../llvm/build/lib/libomp.so) in the Linux branch and
#       adds three targets:
#         next-transpose-lower
#             Runs next-transpose.mlir through the TOSA conversion pipeline and
#             the bufferization passes, writing the result to log.mlir.
#         next-transpose-run
#             Same front end, followed by the convert-*-to-llvm passes, piped
#             into ${MLIR_CPU_RUNNER} with entry point "main".
#         next-transpose-vec-manual-run
#             Lowers next-transpose-vec-manual.mlir and pipes it into
#             ${MLIR_CPU_RUNNER} with entry point "main".
#   next-transpose-vec-manual.mlir (new file)
#       @kernel copies a memref<1x32x40x128xf32> into a freshly allocated
#       memref<1x40x32x128xf32> using vector<64xf32> load/store pairs, with
#       dimensions 1 and 2 exchanged between the load and store indices. It
#       prints the result buffer and the difference of two rtclock() readings.
#   next-transpose.mlir (new file)
#       @kernel applies tosa.transpose with permutation [0, 2, 1, 3] to a
#       tensor<1x32x40x128xf32>, then prints the result and the difference of
#       two rtclock() readings.
#
# NOTE(review): LIB_OMP is not referenced by any target added in this patch,
#   and no Darwin counterpart is visible in these hunks - confirm whether the
#   run targets were meant to pass it through -shared-libs.
# NOTE(review): next-transpose-vec-manual-run passes -convert-scf-to-openmp,
#   while the RUN lines inside next-transpose-vec-manual.mlir do not; those RUN
#   lines also list bufferization passes that the make target omits. Confirm
#   which pipeline is the intended one.
# NOTE(review): leading whitespace (recipe tabs, MLIR indentation, RUN-line
#   continuation spacing) was reconstructed from a whitespace-collapsed copy of
#   this patch - verify against the real files before applying.
diff --git a/examples/BuddyNext/.gitignore b/examples/BuddyNext/.gitignore
index 0194ea7a68..80a243fa81 100644
--- a/examples/BuddyNext/.gitignore
+++ b/examples/BuddyNext/.gitignore
@@ -1,3 +1 @@
-log.mlir
-log.ll
-log.s
+log.*
diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile
index 3ee282499c..ce30c81b2b 100644
--- a/examples/BuddyNext/makefile
+++ b/examples/BuddyNext/makefile
@@ -9,6 +9,7 @@ OPT_FLAG := -O0
 ifeq ($(shell uname),Linux)
 MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
 MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+LIB_OMP := ../../llvm/build/lib/libomp.so
 MTRIPLE := x86_64-unknown-linux-gnu
 else ifeq ($(shell uname),Darwin)
 MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
@@ -313,6 +314,73 @@ next-sgemm-run:
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+next-transpose-lower:
+	@${MLIR_OPT} ./next-transpose.mlir \
+		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
+	${MLIR_OPT} \
+		-arith-expand \
+		-eliminate-empty-tensors \
+		-empty-tensor-to-alloc-tensor \
+		-one-shot-bufferize \
+		-func-bufferize \
+		-arith-bufferize \
+		-o log.mlir
+
+next-transpose-run:
+	@${MLIR_OPT} ./next-transpose.mlir \
+		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
+	${MLIR_OPT} \
+		-arith-expand \
+		-eliminate-empty-tensors \
+		-empty-tensor-to-alloc-tensor \
+		-one-shot-bufferize \
+		-func-bufferize \
+		-arith-bufferize \
+		-convert-linalg-to-affine-loops \
+		-affine-loop-fusion \
+		-lower-affine \
+		-convert-vector-to-scf \
+		-expand-strided-metadata \
+		-convert-vector-to-llvm \
+		-memref-expand \
+		-arith-expand \
+		-convert-arith-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-scf-to-cf \
+		-convert-openmp-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-math-to-llvm \
+		-convert-math-to-libm \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} \
+		-shared-libs=${MLIR_C_RUNNER_UTILS}
+
+next-transpose-vec-manual-run:
+	@${MLIR_OPT} ./next-transpose-vec-manual.mlir \
+		-convert-linalg-to-affine-loops \
+		-affine-loop-fusion \
+		-lower-affine \
+		-convert-scf-to-openmp \
+		-convert-vector-to-scf \
+		-expand-strided-metadata \
+		-convert-vector-to-llvm \
+		-memref-expand \
+		-arith-expand \
+		-convert-arith-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-scf-to-cf \
+		-convert-openmp-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-math-to-llvm \
+		-convert-math-to-libm \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} \
+		-shared-libs=${MLIR_C_RUNNER_UTILS}
+
 next-embedding-lower:
 	@${MLIR_OPT} ./next-embedding.mlir \
 		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
diff --git a/examples/BuddyNext/next-transpose-vec-manual.mlir b/examples/BuddyNext/next-transpose-vec-manual.mlir
new file mode 100644
index 0000000000..ccf5c7b7e4
--- /dev/null
+++ b/examples/BuddyNext/next-transpose-vec-manual.mlir
@@ -0,0 +1,68 @@
+// RUN: buddy-opt %s \
+// RUN:     -convert-linalg-to-affine-loops \
+// RUN:     -affine-loop-fusion \
+// RUN:     -lower-affine \
+// RUN:     -func-bufferize \
+// RUN:     -arith-bufferize \
+// RUN:     -tensor-bufferize \
+// RUN:     -buffer-deallocation \
+// RUN:     -finalizing-bufferize \
+// RUN:     -convert-vector-to-scf \
+// RUN:     -expand-strided-metadata \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -memref-expand \
+// RUN:     -arith-expand \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-openmp-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-math-to-libm \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+  memref.global "private" constant @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32> = dense<3.000000e+00> {alignment = 64 : i64}
+  func.func private @rtclock() -> f64
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @kernel(%arg0: memref<1x32x40x128xf32>) {
+    %0 = call @rtclock() : () -> f64
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x40x32x128xf32>
+    affine.for %arg1 = 0 to 1 {
+      affine.for %arg2 = 0 to 40 {
+        affine.for %arg3 = 0 to 32 {
+          affine.for %arg4 = 0 to 128 step 64 {
+            %3 = vector.load %arg0[%arg1, %arg3, %arg2, %arg4] : memref<1x32x40x128xf32>, vector<64xf32>
+            vector.store %3, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1x40x32x128xf32>, vector<64xf32>
+          }
+        }
+      }
+    }
+    %1 = call @rtclock() : () -> f64
+    %2 = arith.subf %1, %0 : f64
+    %cast = memref.cast %alloc : memref<1x40x32x128xf32> to memref<*xf32>
+
+    // All the elements of the MemRef are the same,
+    // only check the first line to verify the correctness.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 40, 32, 128] strides = [163840, 4096, 128, 1] data =
+    // CHECK-NEXT: [
+    // CHECK-SAME: [
+    // CHECK-SAME: [
+    // CHECK-SAME: [3{{(, 3)*}}],
+
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+    vector.print %2 : f64
+    return
+  }
+  func.func @main() {
+    %0 = memref.get_global @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32>
+    call @kernel(%0) : (memref<1x32x40x128xf32>) -> ()
+    return
+  }
+}
+
diff --git a/examples/BuddyNext/next-transpose.mlir b/examples/BuddyNext/next-transpose.mlir
new file mode 100644
index 0000000000..1b2bd93d62
--- /dev/null
+++ b/examples/BuddyNext/next-transpose.mlir
@@ -0,0 +1,70 @@
+// RUN: buddy-opt %s \
+// RUN:     -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN:     -arith-expand \
+// RUN:     -eliminate-empty-tensors \
+// RUN:     -empty-tensor-to-alloc-tensor \
+// RUN:     -one-shot-bufferize \
+// RUN:     -convert-linalg-to-affine-loops \
+// RUN:     -affine-loop-fusion \
+// RUN:     -lower-affine \
+// RUN:     -func-bufferize \
+// RUN:     -arith-bufferize \
+// RUN:     -tensor-bufferize \
+// RUN:     -buffer-deallocation \
+// RUN:     -finalizing-bufferize \
+// RUN:     -convert-vector-to-scf \
+// RUN:     -expand-strided-metadata \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -memref-expand \
+// RUN:     -arith-expand \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-openmp-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-math-to-libm \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func.func private @rtclock() -> f64
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+
+func.func @kernel(%t0 : tensor<1x32x40x128xf32>) {
+  %t_start = call @rtclock() : () -> f64
+
+  %idx = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+  %t1 = tosa.transpose %t0, %idx : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+
+  %t_end = call @rtclock() : () -> f64
+  %time = arith.subf %t_end, %t_start : f64
+
+  %tensor_unranked = tensor.cast %t1 : tensor<1x40x32x128xf32> to tensor<*xf32>
+
+  // All the elements of the MemRef are the same,
+  // only check the first line to verify the correctness.
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 40, 32, 128] strides = [163840, 4096, 128, 1] data =
+  // CHECK-NEXT: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [3{{(, 3)*}}],
+
+  // Print results.
+  call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+  // Print timings.
+  vector.print %time : f64
+
+  return
+}
+
+func.func @main() {
+  %c0 = arith.constant dense<3.0> : tensor<1x32x40x128xf32>
+  call @kernel(%c0) : (tensor<1x32x40x128xf32>) -> ()
+
+  return
+}