buddy-compiler · NaOHCC · Oct 21, 2024 · Oct 21, 2024 · zhanghb97 · Oct 22, 2024
diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt
@@ -57,6 +57,45 @@ SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C)
 add_executable(buddy-lenet-run buddy-lenet-main.cpp)
 target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR})
 
+if(NOT BUDDY_ENABLE_PNG)
+  message(FATAL_ERROR "To run LeNet inference, the png library is required. Please configure CMake with -DBUDDY_ENABLE_PNG=ON.")
+endif()
 set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES})
 
 target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS})
+
+set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map")
+set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin")
+add_custom_command(
+  OUTPUT subgraph0_gpu.o
+  COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
+            -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" |
+          ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
+          -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION}
+          -buffer-deallocation
+          -convert-linalg-to-parallel-loops
+          -canonicalize
+          -gpu-map-parallel-loops
+          -convert-parallel-loops-to-gpu
+          -gpu-kernel-outlining
+          -canonicalize
+          -cse |
+          ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize |
+          ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} |
+          ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+          ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+          ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_gpu.o
+  DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
+  COMMENT "Building subgraph0_gpu.o"
+  VERBATIM)
+
+add_library(LENET_GPU STATIC subgraph0_gpu.o forward.o)
+
+SET_TARGET_PROPERTIES(LENET_GPU PROPERTIES LINKER_LANGUAGE C)
+
+add_executable(buddy-lenet-run-gpu buddy-lenet-main.cpp)
+target_link_directories(buddy-lenet-run-gpu PRIVATE ${LLVM_LIBRARY_DIR})
+
+set(BUDDY_LENET_LIBS_GPU LENET_GPU mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime ${PNG_LIBRARIES})
+
+target_link_libraries(buddy-lenet-run-gpu ${BUDDY_LENET_LIBS_GPU})
diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp
@@ -18,21 +18,17 @@
 //
 //===---------------------------------------------------------------------===//
 
-#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/TypeRange.h"
 #include "mlir/IR/ValueRange.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
 #include <mlir/Dialect/Affine/IR/AffineOps.h>
 #include <mlir/Dialect/Func/IR/FuncOps.h>
 #include <mlir/Dialect/Linalg/Transforms/Transforms.h>
@@ -42,11 +38,8 @@
 #include <mlir/IR/Value.h>
 #include <mlir/Pass/Pass.h>
 
-#include <map>
-#include <set>
-#include <sstream>
-#include <string>
-#include <utility>
+#include <vector>
+
 using namespace mlir;
 using namespace vector;
 
@@ -82,6 +75,9 @@ class ConvertMemcpyToGPUPass
 void ConvertMemcpyToGPUPass::runOnOperation() {
   auto funcOp = getOperation();
 
+  if (funcOp.isDeclaration() || funcOp.isExternal())
+    return;
+
   // Make sure the gpu function is already outlined.
   funcOp->walk<WalkOrder::PreOrder>([&](Operation *nestedOp) {
     if (auto gpuLaunchOp = dyn_cast<gpu::LaunchOp>(nestedOp)) {
@@ -90,8 +86,9 @@ void ConvertMemcpyToGPUPass::runOnOperation() {
     return WalkResult::advance();
   });
 
-  std::set<gpu::AllocOp *> unDeallocatedOperations;
+  std::vector<Value> unDeallocatedValue;
   OpBuilder builder(funcOp->getContext());
+
   // Copy all function arguments to gpu, needs deallocation
   if (processArgs) {
     builder.setInsertionPointToStart(&(funcOp.getBody().front()));
@@ -103,23 +100,11 @@ void ConvertMemcpyToGPUPass::runOnOperation() {
       auto memrefType = dyn_cast<MemRefType>(arg.getType());
       auto gpuAllocOp = builder.create<gpu::AllocOp>(
           builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({}));
-      unDeallocatedOperations.insert(&gpuAllocOp);
+      unDeallocatedValue.push_back(gpuAllocOp->getResult(0));
       auto gpuMemcpyOp = builder.create<gpu::MemcpyOp>(
           gpuAllocOp.getLoc(), TypeRange(), ValueRange(),
           gpuAllocOp.getResult(0), arg);
-      // Replace all users with GPU memory
-      auto users = arg.getUsers();
-      std::vector<Operation *> usersVec(users.begin(), users.end());
-      for (auto user : usersVec) {
-        // Don't replace memcpy's operand
-        if (isa<gpu::MemcpyOp>(user))
-          continue;
-        for (size_t j = 0; j < user->getNumOperands(); j++) {
-          if (user->getOperand(j) == arg) {
-            user->setOperand(j, gpuAllocOp.getResult(0));
-          }
-        }
-      }
+      arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp);
     }
   }
 
@@ -149,19 +134,18 @@ void ConvertMemcpyToGPUPass::runOnOperation() {
 
       auto gpuAllocOp = builder.create<gpu::AllocOp>(
           allocOp->getLoc(), TypeRange({memrefType}), ValueRange({}));
-      auto users = result.getUsers();
-      std::vector<Operation *> usersVec(users.begin(), users.end());
-      for (auto user : usersVec) {
-        for (size_t j = 0; j < user->getNumOperands(); j++) {
-          // Only the return value will not have dealloc op
-          if (auto deallocOp = dyn_cast<memref::DeallocOp>(user)) {
-            builder.setInsertionPointAfter(deallocOp);
-            auto gpuDeallocOp = builder.create<gpu::DeallocOp>(
-                deallocOp->getLoc(), TypeRange(), ValueRange(),
-                gpuAllocOp.getResult(0));
-            deallocOp->erase();
-          } else if (user->getOperand(j) == result) {
-            user->setOperand(j, gpuAllocOp.getResult(0));
-          }
-        }
-      }
+      Value gpuBuffer = gpuAllocOp.getResult(0);
+      // Mirror each memref.dealloc of the host buffer with a gpu.dealloc (only
+      // a returned buffer has none). Erasing while iterating is safe here: a
+      // dealloc is a single use, and early-inc has already stepped past it.
+      for (auto *user : llvm::make_early_inc_range(result.getUsers())) {
+        if (auto deallocOp = dyn_cast<memref::DeallocOp>(user)) {
+          builder.setInsertionPointAfter(deallocOp);
+          builder.create<gpu::DeallocOp>(deallocOp->getLoc(), TypeRange(),
+                                         ValueRange(), gpuBuffer);
+          deallocOp->erase();
+        }
+      }
+      // Redirect the remaining uses in one step. Re-pointing operands while
+      // walking the use list would skip users if an op used the value twice.
+      result.replaceAllUsesWith(gpuBuffer);
@@ -175,28 +159,8 @@ void ConvertMemcpyToGPUPass::runOnOperation() {
       builder.setInsertionPointAfter(copyOp);
-      auto gpuMemcpyOp = builder.create<gpu::MemcpyOp>(
-          copyOp->getLoc(), TypeRange(), ValueRange(), dst, src);
-      {
-        auto users = src.getUsers();
-        std::vector<Operation *> usersVec(users.begin(), users.end());
-        for (auto user : usersVec) {
-          for (size_t j = 0; j < user->getNumOperands(); j++) {
-            if (user->getOperand(j) == src) {
-              user->setOperand(j, gpuMemcpyOp.getOperand(1));
-            }
-          }
-        }
-      }
-      {
-        auto users = dst.getUsers();
-        std::vector<Operation *> usersVec(users.begin(), users.end());
-        for (auto user : usersVec) {
-          for (size_t j = 0; j < user->getNumOperands(); j++) {
-            if (user->getOperand(j) == src) {
-              user->setOperand(j, gpuMemcpyOp.getOperand(0));
-            }
-          }
-        }
-      }
+      // gpu.memcpy is built without an async token, so it defines no results
+      // to substitute; `src` and `dst` stay valid and only the op is swapped.
+      builder.create<gpu::MemcpyOp>(copyOp->getLoc(), TypeRange(), ValueRange(),
+                                    dst, src);
       copyOp->erase();
     }
     // Allocate space on GPU and copy global memrefs to GPU, needs deallocation
@@ -206,47 +170,34 @@ void ConvertMemcpyToGPUPass::runOnOperation() {
       auto memrefType = dyn_cast<MemRefType>(result.getType());
       auto gpuAllocOp = builder.create<gpu::AllocOp>(
           getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({}));
-      unDeallocatedOperations.insert(&gpuAllocOp);
+      unDeallocatedValue.push_back(gpuAllocOp->getResult(0));
+
       auto src = result;
       auto dst = gpuAllocOp->getResult(0);
       auto gpuMemcpyOp = builder.create<gpu::MemcpyOp>(
           gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src);
-      {
-        auto users = src.getUsers();
-        std::vector<Operation *> usersVec(users.begin(), users.end());
-        for (auto user : usersVec) {
-          if (isa<gpu::MemcpyOp>(user))
-            continue;
-          // TODO: replace with src.replaceAllUsesExcept()
-          for (size_t j = 0; j < user->getNumOperands(); j++) {
-            if (user->getOperand(j) == src) {
-              user->setOperand(j, dst);
-            }
-          }
-        }
-      }
+      src.replaceAllUsesExcept(dst, gpuMemcpyOp);
     }
     // Copy data back to CPU, deallocate GPU, then return
     else if (auto returnOp = dyn_cast<func::ReturnOp>(nestedOp)) {
       builder.setInsertionPoint(returnOp);
-
-      for (auto *gpuAllocOp : unDeallocatedOperations) {
-        auto gpuDeallocOp = builder.create<gpu::DeallocOp>(
-            builder.getUnknownLoc(), TypeRange(), ValueRange(),
-            gpuAllocOp->getResult(0));
-      }
-      builder.setInsertionPoint(returnOp);
       for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) {
         auto val = returnOp->getOperand(i);
-        auto memRefType = dyn_cast<MemRefType>(val.getType());
-        auto allocOp = builder.create<memref::AllocOp>(builder.getUnknownLoc(),
-                                                       memRefType);
-        auto gpuMemcpyOp = builder.create<gpu::MemcpyOp>(
-            allocOp.getLoc(), TypeRange(), ValueRange(), allocOp->getResult(0),
-            val);
-        auto gpuDeallocOp = builder.create<gpu::DeallocOp>(
-            gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val);
-        returnOp->setOperand(i, allocOp->getResult(0));
+        if (auto memrefType = dyn_cast<MemRefType>(val.getType())) {
+          auto allocOp =
+              builder.create<memref::AllocOp>(returnOp->getLoc(), memrefType);
+          builder.create<gpu::MemcpyOp>(allocOp.getLoc(), TypeRange(),
+                                        ValueRange(), allocOp->getResult(0),
+                                        val);
+          // FIXME: `val` (the GPU-side buffer being returned) is never freed,
+          // so it may leak. Emitting a gpu.dealloc for `val` here is only safe
+          // if `val` is not also in unDeallocatedValue (else a double free).
+          returnOp->setOperand(i, allocOp->getResult(0));
+        }
+      }
+      for (auto value : unDeallocatedValue) {
+        builder.create<gpu::DeallocOp>(returnOp->getLoc(), TypeRange(),
+                                       ValueRange(), value);
       }
     }
     return WalkResult::advance();

diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir
@@ -1,22 +1,68 @@
-// RUN: buddy-opt -convert-memcpy-to-gpu -canonicalize %s | FileCheck %s
+// RUN: buddy-opt -convert-memcpy-to-gpu="process-args=1" %s | FileCheck %s
 
-// CHECK: %memref = gpu.alloc  () : memref<32x32xf32>
-// CHECK: %memref_0 = gpu.alloc  () : memref<32x32xf32>
-// CHECK: gpu.dealloc  %memref : memref<32x32xf32>
-// CHECK: %alloc = memref.alloc() : memref<32x32xf32>
-// CHECK: gpu.memcpy  %alloc, %memref_0 : memref<32x32xf32>, memref<32x32xf32>
-// CHECK: gpu.dealloc  %memref_0 : memref<32x32xf32>
+#map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
 module attributes {gpu.container_module} {
-  func.func @matmul(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>) -> memref<32x32xf32> {
-    %c2 = arith.constant 2 : index
-    %c64 = arith.constant 64 : index
+  memref.global "private" constant @__constant_1x10x10xf32 : memref<1x10x10xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+  func.func @matmul(%arg0: memref<1x10x10xf32>, %arg1: memref<1x10x10xf32>) -> memref<1x10x10xf32> {
+    // CHECK: %[[d_arg0:.*]] = gpu.alloc  () : memref<1x10x10xf32>
+    // CHECK-NEXT: gpu.memcpy  %[[d_arg0]], %arg0 : memref<1x10x10xf32>, memref<1x10x10xf32>
+    // CHECK: %[[d_arg1:.*]] = gpu.alloc  () : memref<1x10x10xf32>
+    // CHECK-NEXT: gpu.memcpy  %[[d_arg1]], %arg1 : memref<1x10x10xf32>, memref<1x10x10xf32>
+    %c10 = arith.constant 10 : index
+    %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
-    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32x32xf32>
-    gpu.launch_func  @matmul_kernel::@matmul_kernel blocks in (%c1, %c1, %c1) threads in (%c64, %c2, %c1)  
-    return %alloc : memref<32x32xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    // CHECK: %[[h_global_data:.*]] = memref.get_global @__constant_1x10x10xf32 : memref<1x10x10xf32>
+    // CHECK: %[[d_global_data:.*]] = gpu.alloc  () : memref<1x10x10xf32>
+    // CHECK: gpu.memcpy  %[[d_global_data]], %[[h_global_data]] : memref<1x10x10xf32>, memref<1x10x10xf32>
+    %0 = memref.get_global @__constant_1x10x10xf32 : memref<1x10x10xf32>
+    // CHECK: %[[d_alloc0:.*]] = gpu.alloc  () : memref<1x10x10xf32>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32>
+    // CHECK: gpu.launch_func
+    gpu.launch_func  @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %cst : f32, %alloc : memref<1x10x10xf32>)
+    // CHECK: gpu.launch_func
+    // CHECK-SAME: %[[d_arg0]]
+    // CHECK-SAME: %[[d_arg1]]
+    // CHECK-SAME: %[[d_alloc0]]
+    gpu.launch_func  @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %arg0 : memref<1x10x10xf32>, %arg1 : memref<1x10x10xf32>, %alloc : memref<1x10x10xf32>, %c10 : index)
+    // CHECK: %[[d_alloc1:.*]] = gpu.alloc  () : memref<1x10x10xf32>
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32>
+    // CHECK: gpu.launch_func
+    gpu.launch_func  @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %cst : f32, %alloc_0 : memref<1x10x10xf32>)
+    // CHECK: gpu.launch_func
+    // CHECK-SAME: %[[d_global_data]]
+    // CHECK-SAME: %[[d_alloc0]]
+    // CHECK-SAME: %[[d_alloc1]]
+    gpu.launch_func  @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %0 : memref<1x10x10xf32>, %alloc : memref<1x10x10xf32>, %alloc_0 : memref<1x10x10xf32>, %c10 : index)
+    // CHECK: %[[d_result:.*]] = gpu.alloc  () : memref<1x10x10xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32>
+    // CHECK: gpu.launch_func
+    gpu.launch_func  @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %cst : f32, %alloc_1 : memref<1x10x10xf32>)
+    // CHECK: gpu.launch_func
+    // CHECK-SAME: %[[d_alloc0]]
+    // CHECK-SAME: %[[d_alloc1]]
+    // CHECK-SAME: %[[d_result]]
+    gpu.launch_func  @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %alloc : memref<1x10x10xf32>, %alloc_0 : memref<1x10x10xf32>, %alloc_1 : memref<1x10x10xf32>, %c10 : index)
+    // CHECK: gpu.dealloc  %[[d_alloc1]] : memref<1x10x10xf32>
+    memref.dealloc %alloc_0 : memref<1x10x10xf32>
+    // CHECK: gpu.dealloc  %[[d_alloc0]] : memref<1x10x10xf32>
+    memref.dealloc %alloc : memref<1x10x10xf32>
+
+    // CHECK: %[[h_alloc:.*]] = memref.alloc() : memref<1x10x10xf32>
+    // CHECK-NEXT: gpu.memcpy  %[[h_alloc]], %[[d_result]] : memref<1x10x10xf32>, memref<1x10x10xf32>
+
+    // CHECK: gpu.dealloc  %[[d_arg0]] : memref<1x10x10xf32>
+    // CHECK: gpu.dealloc  %[[d_arg1]] : memref<1x10x10xf32>
+    // CHECK: gpu.dealloc  %[[d_global_data]] : memref<1x10x10xf32>
+
+    // CHECK: return %[[h_alloc]] : memref<1x10x10xf32>
+    return %alloc_1 : memref<1x10x10xf32>
   }
-  gpu.module @matmul_kernel {
-    gpu.func @matmul_kernel() kernel attributes {gpu.known_block_size = array<i32: 64, 2, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>} {
+  gpu.module @kernel {
+    gpu.func @fill(%arg0: index, %arg1: index, %arg2: f32, %arg3: memref<1x10x10xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>} {
+      gpu.return
+    }
+    gpu.func @matmul(%arg0: index, %arg1: index, %arg2: memref<1x10x10xf32>, %arg3: memref<1x10x10xf32>, %arg4: memref<1x10x10xf32>, %arg5: index) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>} {
       gpu.return
     }
   }