Commit

Merge branch 'poolingnhwcmax' of https://github.com/FloatingcloudKnight/buddy-mlir into poolingnhwcmax

FloatingcloudKnight committed Jan 7, 2025
2 parents 35d4de3 + f2161e3 commit 9233ea7
Showing 69 changed files with 3,174 additions and 77 deletions.
59 changes: 59 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,59 @@
repos:
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.3 # Use the version of clang-format you have installed
    hooks:
      - id: clang-format
        name: clang-format C++ code
        files: \.(cpp|hpp|cc|cxx|h|c|hxx)$
        args: [--style=llvm] # You can set your preferred style here

  # - repo: https://github.com/pocc/pre-commit-hooks
  #   rev: v1.3.5 # Use the latest stable version
  #   hooks:
  #     - id: clang-tidy
  #       name: clang-tidy C++ code
  #       files: \.(cpp|hpp|cc|cxx|h|c|hxx)$
  #       args:
  #         - --quiet
  #         - --checks=*,-clang-diagnostic-*,-clang-analyzer-*
  #         - --extra-arg=-x
  #         - --extra-arg=c++
  #         - --extra-arg=-std=c++17
  #         - --warnings-as-errors=*
  #         - --extra-arg=-I.

  - repo: https://github.com/psf/black
    rev: 24.10.0 # Use the latest stable version
    hooks:
      - id: black
        language_version: python3.10

  # flake8 is a comprehensive tool for checking the style and quality of Python code.
  # It combines three popular Python tools:
  #   PyFlakes: checks for logical errors in the code.
  #   pycodestyle (formerly known as pep8): checks for adherence to the PEP 8 style guide.
  #   McCabe complexity checker: measures the complexity of your code.
  - repo: https://github.com/PyCQA/flake8
    rev: 6.1.0 # Set the rev to match the desired flake8 version
    hooks:
      - id: flake8
        args:
          - --max-line-length=88 # Adjust as per your style guide
          - --ignore=F821,F403,F405,F401,W503,E203,E402,E401,W605,E712,E711,F841
          # W503: Line break before binary operator
          # E203: Whitespace before colon
          # E402: Module level import not at top of file
          # E401: Multiple imports on one line
          # W605: Invalid escape sequence
          # E712: Comparison to True/False
          # E711: Comparison to None
          # F841: Unused variable

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0 # Updated to the latest version
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-merge-conflict
      - id: check-yaml
      - id: check-added-large-files
6 changes: 6 additions & 0 deletions README.md
@@ -195,6 +195,12 @@ This program should be a drop-in replacement for `mlir-lsp-server`, supporting n

After modification, your editor should have correct completion and error prompts for new dialects such as `rvv` and `gemmini`.

### pre-commit checks

The `.pre-commit-config.yaml` file defines format and style checks that run on each commit, using tools such as clang-format, black, and flake8. You can also run the checks without committing via `pre-commit run --all-files`. This enforces consistent coding standards and catches common errors before changes are pushed.

To get started, install pre-commit (e.g., `pip install pre-commit`) and verify that clang-format, black, and flake8 are available. On Linux, clang-format is available from your package manager, and the Python tools from pip. To undo unwanted formatting changes, use `git stash`, `git restore .` (all files), or `git restore <file>` (a specific file), or revert the commit through your Git history.
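As a concrete starting point, the whole workflow looks like this (a minimal sketch assuming a pip-based setup on Debian/Ubuntu; `pre-commit install`, which registers the hooks with your local clone, is part of the standard pre-commit CLI):

```bash
# Install pre-commit and the Python-side checkers.
pip install pre-commit black flake8

# clang-format typically comes from the system package manager.
sudo apt install clang-format

# Register the hooks from .pre-commit-config.yaml with this clone,
# so the checks run automatically on every commit.
pre-commit install

# Run all configured checks over the whole tree without committing.
pre-commit run --all-files

# Undo unwanted formatting changes if needed.
git restore <file>   # a specific file
git restore .        # all files
```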

## Examples

The purpose of the examples is to give users a better understanding of how to use the passes and the interfaces in buddy-mlir. Currently, we provide three types of examples.
2 changes: 1 addition & 1 deletion examples/BuddyLeNet/buddy-lenet-import.py
@@ -26,7 +26,7 @@

from buddy.compiler.frontend import DynamoCompiler
from buddy.compiler.graph import GraphDriver
-from buddy.compiler.graph.transform import simply_fuse
+from buddy.compiler.graph.transform import simply_fuse, apply_classic_fusion
from buddy.compiler.ops import tosa
from model import LeNet

10 changes: 5 additions & 5 deletions examples/BuddyLlama/CMakeLists.txt
@@ -58,18 +58,18 @@ add_custom_command(
          -eliminate-empty-tensors
          -empty-tensor-to-alloc-tensor
          -one-shot-bufferize
-         -func-bufferize-dynamic-offset
-         -tensor-bufferize
-         -arith-bufferize
-         -buffer-deallocation
-         -finalizing-bufferize
          -matmul-parallel-vectorization-optimize
          -batchmatmul-optimize
          -convert-linalg-to-affine-loops
          -affine-loop-fusion
          -affine-parallelize
          -lower-affine
          -convert-scf-to-openmp
+         -func-bufferize-dynamic-offset
+         -tensor-bufferize
+         -arith-bufferize
+         -buffer-deallocation
+         -finalizing-bufferize
          -convert-vector-to-scf
          -expand-strided-metadata
          -cse
2 changes: 1 addition & 1 deletion examples/BuddyLlama/import-llama2.py
@@ -28,7 +28,7 @@
from buddy.compiler.frontend import DynamoCompiler
from buddy.compiler.ops import tosa
from buddy.compiler.graph import GraphDriver
-from buddy.compiler.graph.transform import simply_fuse
+from buddy.compiler.graph.transform import simply_fuse, apply_classic_fusion

# Retrieve the LLaMA model path from environment variables.
model_path = os.environ.get("LLAMA_MODEL_PATH")
4 changes: 1 addition & 3 deletions examples/BuddyNext/.gitignore
@@ -1,3 +1 @@
-log.mlir
-log.ll
-log.s
+log.*
68 changes: 68 additions & 0 deletions examples/BuddyNext/makefile
@@ -9,6 +9,7 @@ OPT_FLAG := -O0
ifeq ($(shell uname),Linux)
MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+LIB_OMP := ../../llvm/build/lib/libomp.so
MTRIPLE := x86_64-unknown-linux-gnu
else ifeq ($(shell uname),Darwin)
MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
@@ -313,6 +314,73 @@ next-sgemm-run:
	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

next-transpose-lower:
	@${MLIR_OPT} ./next-transpose.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
	${MLIR_OPT} \
		-arith-expand \
		-eliminate-empty-tensors \
		-empty-tensor-to-alloc-tensor \
		-one-shot-bufferize \
		-func-bufferize \
		-arith-bufferize \
		-o log.mlir

next-transpose-run:
	@${MLIR_OPT} ./next-transpose.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
	${MLIR_OPT} \
		-arith-expand \
		-eliminate-empty-tensors \
		-empty-tensor-to-alloc-tensor \
		-one-shot-bufferize \
		-func-bufferize \
		-arith-bufferize \
		-convert-linalg-to-affine-loops \
		-affine-loop-fusion \
		-lower-affine \
		-convert-vector-to-scf \
		-expand-strided-metadata \
		-convert-vector-to-llvm \
		-memref-expand \
		-arith-expand \
		-convert-arith-to-llvm \
		-finalize-memref-to-llvm \
		-convert-scf-to-cf \
		-convert-openmp-to-llvm \
		-convert-arith-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-func-to-llvm \
		-reconcile-unrealized-casts | \
	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
		-shared-libs=${MLIR_RUNNER_UTILS} \
		-shared-libs=${MLIR_C_RUNNER_UTILS}

next-transpose-vec-manual-run:
	@${MLIR_OPT} ./next-transpose-vec-manual.mlir \
		-convert-linalg-to-affine-loops \
		-affine-loop-fusion \
		-lower-affine \
		-convert-scf-to-openmp \
		-convert-vector-to-scf \
		-expand-strided-metadata \
		-convert-vector-to-llvm \
		-memref-expand \
		-arith-expand \
		-convert-arith-to-llvm \
		-finalize-memref-to-llvm \
		-convert-scf-to-cf \
		-convert-openmp-to-llvm \
		-convert-arith-to-llvm \
		-convert-math-to-llvm \
		-convert-math-to-libm \
		-convert-func-to-llvm \
		-reconcile-unrealized-casts | \
	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
		-shared-libs=${MLIR_RUNNER_UTILS} \
		-shared-libs=${MLIR_C_RUNNER_UTILS}

next-embedding-lower:
	@${MLIR_OPT} ./next-embedding.mlir \
		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
68 changes: 68 additions & 0 deletions examples/BuddyNext/next-transpose-vec-manual.mlir
@@ -0,0 +1,68 @@
// RUN: buddy-opt %s \
// RUN: -convert-linalg-to-affine-loops \
// RUN: -affine-loop-fusion \
// RUN: -lower-affine \
// RUN: -func-bufferize \
// RUN: -arith-bufferize \
// RUN: -tensor-bufferize \
// RUN: -buffer-deallocation \
// RUN: -finalizing-bufferize \
// RUN: -convert-vector-to-scf \
// RUN: -expand-strided-metadata \
// RUN: -convert-vector-to-llvm \
// RUN: -memref-expand \
// RUN: -arith-expand \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-scf-to-cf \
// RUN: -convert-openmp-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -convert-math-to-libm \
// RUN: -convert-func-to-llvm \
// RUN: -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

module {
  memref.global "private" constant @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32> = dense<3.000000e+00> {alignment = 64 : i64}
  func.func private @rtclock() -> f64
  func.func private @printMemrefF32(memref<*xf32>)
  func.func @kernel(%arg0: memref<1x32x40x128xf32>) {
    %0 = call @rtclock() : () -> f64
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x40x32x128xf32>
    affine.for %arg1 = 0 to 1 {
      affine.for %arg2 = 0 to 40 {
        affine.for %arg3 = 0 to 32 {
          affine.for %arg4 = 0 to 128 step 64 {
            %3 = vector.load %arg0[%arg1, %arg3, %arg2, %arg4] : memref<1x32x40x128xf32>, vector<64xf32>
            vector.store %3, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1x40x32x128xf32>, vector<64xf32>
          }
        }
      }
    }
    %1 = call @rtclock() : () -> f64
    %2 = arith.subf %1, %0 : f64
    %cast = memref.cast %alloc : memref<1x40x32x128xf32> to memref<*xf32>

    // All the elements of the MemRef are the same, so checking
    // the first line is enough to verify correctness.
    // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 40, 32, 128] strides = [163840, 4096, 128, 1] data =
    // CHECK-NEXT: [
    // CHECK-SAME: [
    // CHECK-SAME: [
    // CHECK-SAME: [3{{(, 3)*}}],

    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
    vector.print %2 : f64
    return
  }
  func.func @main() {
    %0 = memref.get_global @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32>
    call @kernel(%0) : (memref<1x32x40x128xf32>) -> ()
    return
  }
}

70 changes: 70 additions & 0 deletions examples/BuddyNext/next-transpose.mlir
@@ -0,0 +1,70 @@
// RUN: buddy-opt %s \
// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
// RUN: | buddy-opt \
// RUN: -arith-expand \
// RUN: -eliminate-empty-tensors \
// RUN: -empty-tensor-to-alloc-tensor \
// RUN: -one-shot-bufferize \
// RUN: -convert-linalg-to-affine-loops \
// RUN: -affine-loop-fusion \
// RUN: -lower-affine \
// RUN: -func-bufferize \
// RUN: -arith-bufferize \
// RUN: -tensor-bufferize \
// RUN: -buffer-deallocation \
// RUN: -finalizing-bufferize \
// RUN: -convert-vector-to-scf \
// RUN: -expand-strided-metadata \
// RUN: -convert-vector-to-llvm \
// RUN: -memref-expand \
// RUN: -arith-expand \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-scf-to-cf \
// RUN: -convert-openmp-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -convert-math-to-libm \
// RUN: -convert-func-to-llvm \
// RUN: -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

func.func private @rtclock() -> f64
func.func private @printMemrefF32(%ptr : tensor<*xf32>)

func.func @kernel(%t0 : tensor<1x32x40x128xf32>) {
  %t_start = call @rtclock() : () -> f64

  %idx = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
  %t1 = tosa.transpose %t0, %idx : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>

  %t_end = call @rtclock() : () -> f64
  %time = arith.subf %t_end, %t_start : f64

  %tensor_unranked = tensor.cast %t1 : tensor<1x40x32x128xf32> to tensor<*xf32>

  // All the elements of the MemRef are the same, so checking
  // the first line is enough to verify correctness.
  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 40, 32, 128] strides = [163840, 4096, 128, 1] data =
  // CHECK-NEXT: [
  // CHECK-SAME: [
  // CHECK-SAME: [
  // CHECK-SAME: [3{{(, 3)*}}],

  // Print results.
  call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
  // Print timings.
  vector.print %time : f64

  return
}

func.func @main() {
  %c0 = arith.constant dense<3.0> : tensor<1x32x40x128xf32>
  call @kernel(%c0) : (tensor<1x32x40x128xf32>) -> ()

  return
}