Add multicore flag (nod-ai#128)
By default, the PackPipeline uses only one AIE tile, tiling to `(16,64)` in the first tiling operation.

With the new flag `iree-amdaie-num-cores`, this behavior can be adapted:
- `iree-amdaie-num-cores=2`: tile to `(16,128)` and use two AIE tiles
- `iree-amdaie-num-cores=4`: tile to `(16,256)` and use four AIE tiles

Values other than 1, 2, or 4 result in an error.
This flag is ignored within the PadPipeline and SimplePackPipeline.
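
For illustration, here is a minimal sketch of the tile-size selection this flag drives, assuming only the behavior described above and in the diff below (the helper name `packPipelineTileSizesL0` is hypothetical, not part of this commit):

```cpp
#include <cstdint>
#include <vector>

// First-level tile sizes for the PackPipeline: the second (N) tile size
// scales linearly with the number of AIE cores in use.
std::vector<int64_t> packPipelineTileSizesL0(int32_t numCores) {
  // Only 1, 2, or 4 cores are handled; the pass emits an op error for
  // any other value (modeled here as an empty result).
  if (numCores != 1 && numCores != 2 && numCores != 4) return {};
  return {16, 64 * numCores};  // (16,64), (16,128), or (16,256)
}
```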

---------

Co-authored-by: Franz Haniel <[email protected]>
frafranz and FranzHaniel authored Feb 9, 2024
1 parent 69b6422 commit 831d3d5
Showing 8 changed files with 132 additions and 12 deletions.
@@ -47,7 +47,10 @@ void AMDAIELoweringStrategyPass::runOnOperation() {
"Expected a variantOp root with an inner ModuleOp");
return signalPassFailure();
}
if (failed(initAIELaunchConfig(moduleOp, usePassPipeline))) {
// To simplify development, the number of cores can be passed as a flag during
// compilation. In the future these parameters could be read from a file.
struct AIEConfig cfg = {numCores};
if (failed(initAIELaunchConfig(moduleOp, usePassPipeline, cfg))) {
return signalPassFailure();
}
}
@@ -20,7 +20,8 @@ namespace mlir::iree_compiler::AMDAIE {
/// implements the contraction operation interface.
static LogicalResult setRootConfig(func::FuncOp entryPointFn,
linalg::MatmulOp matmulOp,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline,
AIEConfig cfg) {
assert(!getLoweringConfig(matmulOp) && "expected lowering_config is not set");
auto linalgOp = cast<linalg::LinalgOp>(matmulOp.getOperation());
unsigned numLoops = linalgOp.getNumLoops();
@@ -55,7 +56,9 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
entryPointFn, matmulOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::None);
} else if (usePassPipeline == AIEPassPipeline::PackPipeline) {
SmallVector<int64_t> TileSizeLevel0 = {16, 64};
if (!(cfg.num_cores == 1 || cfg.num_cores == 2 || cfg.num_cores == 4))
return matmulOp.emitOpError("unhandled number of cores");
SmallVector<int64_t> TileSizeLevel0 = {16, 64 * cfg.num_cores};
SmallVector<int64_t> TileSizeLevel1 = {0, 0, 64};
SmallVector<int64_t> TileSizeLevel2 = {1, 1};
TileSizesListType tileSizes = {TileSizeLevel0, TileSizeLevel1,
@@ -69,14 +72,15 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,

/// Redirects to methods that set the configuration based on operation type.
static LogicalResult setRootConfigImpl(func::FuncOp entryPointFn, Operation *op,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline,
AIEConfig cfg) {
auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(op)
// TODO (nmeshram): This is very limited for now; the plan is to
// let it first crash for all the other ops and then consciously
// add support for them, this way we can verify our work.
.Case<linalg::MatmulOp>([&](auto op) {
return setRootConfig(entryPointFn, op, usePassPipeline);
return setRootConfig(entryPointFn, op, usePassPipeline, cfg);
})
.Default([&](Operation *op) { return success(); });
};
@@ -86,7 +90,7 @@ static LogicalResult setRootConfigImpl(func::FuncOp entryPointFn, Operation *op,
/// Sets the translation information to use for a dispatch region.
static LogicalResult setTranslationInfoAndRootConfig(
func::FuncOp entryPointFn, ArrayRef<Operation *> computeOps,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline, AIEConfig cfg) {
// Make sure that lowering_config is not preset on any compute ops.
for (auto computeOp : computeOps) {
if (getLoweringConfig(computeOp)) return failure();
@@ -101,7 +105,8 @@ static LogicalResult setTranslationInfoAndRootConfig(
return entryPointFn.emitError("Case with no root ops not yet supported.");
}

if (failed(setRootConfigImpl(entryPointFn, rootOperation, usePassPipeline))) {
if (failed(setRootConfigImpl(entryPointFn, rootOperation, usePassPipeline,
cfg))) {
return failure();
}

@@ -112,7 +117,8 @@
}

LogicalResult initAIELaunchConfig(ModuleOp moduleOp,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline,
AIEConfig cfg) {
llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
getAllEntryPoints(moduleOp);
for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
Expand All @@ -127,7 +133,7 @@ LogicalResult initAIELaunchConfig(ModuleOp moduleOp,

SmallVector<Operation *> computeOps = getComputeOps(funcOp);
if (failed(setTranslationInfoAndRootConfig(funcOp, computeOps,
usePassPipeline))) {
usePassPipeline, cfg))) {
return failure();
}
}
@@ -22,9 +22,15 @@ enum class AIEPassPipeline : int32_t {
None = 3
};

LogicalResult initAIELaunchConfig(ModuleOp moduleOp,
AIEPassPipeline usePassPipeline);
/// Struct specifying the number of cores to use. This will be replaced
/// by more versatile handling in the future.
struct AIEConfig {
int32_t num_cores;
};

LogicalResult initAIELaunchConfig(ModuleOp moduleOp,
AIEPassPipeline usePassPipeline,
AIEConfig cfg);
} // namespace mlir::iree_compiler::AMDAIE

#endif // IREE_AMD_AIE_TRANSFORMS_KERNELDISPATCH_H_
@@ -37,6 +37,10 @@ static llvm::cl::opt<AIEPassPipeline> clUsePipeline(
"pack operation")),
llvm::cl::init(AIEPassPipeline::SimplePackPipeline));

static llvm::cl::opt<int32_t> clNumCores(
"iree-amdaie-num-cores",
llvm::cl::desc("Choose the number of cores to use"), llvm::cl::init(1));

//===---------------------------------------------------------------------===//
// Default allocation functions for AIE backend
//===---------------------------------------------------------------------===//
@@ -271,6 +275,7 @@ void buildAMDAIETransformPassPipeline(OpPassManager &pm) {
{
AMDAIELoweringStrategyOptions options;
options.usePassPipeline = clUsePipeline;
options.numCores = clNumCores;
pm.addPass(createAMDAIELoweringStrategyPass(options));
}
{
@@ -94,7 +94,9 @@ def AMDAIELoweringStrategy :
"Use the more advanced pack-based lowering strategy, including peeling and double-buffering."),
clEnumValN(mlir::iree_compiler::AMDAIE::AIEPassPipeline::SimplePackPipeline, "simple-pack",
"Use the simple-pack based lowering strategy.")
)}]>
)}]>,
Option<"numCores", "num-cores", "int32_t", /*default=*/"1",
"Choose the number of cores to use">
];
}

2 changes: 2 additions & 0 deletions tests/samples/CMakeLists.txt
@@ -9,6 +9,8 @@ iree_lit_test_suite(
lit
SRCS
"pack_pipeline_funcIR.mlir"
"pack_pipeline_funcIR_2core.mlir"
"pack_pipeline_funcIR_4core.mlir"
"pad_pipeline_e2e.mlir"
"simple_pack_pipeline_e2e.mlir"
TOOLS
48 changes: 48 additions & 0 deletions tests/samples/pack_pipeline_funcIR_2core.mlir
@@ -0,0 +1,48 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --mlir-print-ir-after=fold-memref-alias-ops %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-use-pipeline=pack --iree-amdaie-num-cores=2 | FileCheck %s

func.func @matmul_example(%lhs: tensor<16x256xi8>, %rhs: tensor<256x256xi8>) -> tensor<16x256xi32>
{
%empty = tensor.empty() : tensor<16x256xi32>
%cst = arith.constant 0 : i32
%fill = linalg.fill ins(%cst : i32) outs(%empty : tensor<16x256xi32>) -> tensor<16x256xi32>
%res = linalg.matmul ins(%lhs, %rhs: tensor<16x256xi8>, tensor<256x256xi8>)
outs(%fill: tensor<16x256xi32>) -> tensor<16x256xi32>
return %res : tensor<16x256xi32>
}

// CHECK-LABEL: @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi32, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x2x16x64xi32, 1 : i32>
// CHECK: memref.alloc() : memref<1x2x64x64xi8, 1 : i32>
// CHECK: memref.alloc() : memref<1x1x16x64xi8, 1 : i32>
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x128xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xi8, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: linalg.fill
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[2048, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x2x16x64xi32, 1 : i32> memref<16x128xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: scf.for
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x128xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x128xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x16x64xi32, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi32, strided<[2048, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi32, 2 : i32>)
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[2048, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x2x16x64xi32, 1 : i32> memref<16x128xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: memref.dealloc %{{.*}} : memref<1x1x16x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x2x64x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x2x16x64xi32, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi32, 2 : i32>


48 changes: 48 additions & 0 deletions tests/samples/pack_pipeline_funcIR_4core.mlir
@@ -0,0 +1,48 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --mlir-print-ir-after=fold-memref-alias-ops %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-use-pipeline=pack --iree-amdaie-num-cores=4 | FileCheck %s

func.func @matmul_example(%lhs: tensor<16x256xi8>, %rhs: tensor<256x256xi8>) -> tensor<16x256xi32>
{
%empty = tensor.empty() : tensor<16x256xi32>
%cst = arith.constant 0 : i32
%fill = linalg.fill ins(%cst : i32) outs(%empty : tensor<16x256xi32>) -> tensor<16x256xi32>
%res = linalg.matmul ins(%lhs, %rhs: tensor<16x256xi8>, tensor<256x256xi8>)
outs(%fill: tensor<16x256xi32>) -> tensor<16x256xi32>
return %res : tensor<16x256xi32>
}

// CHECK-LABEL: @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi32, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x4x16x64xi32, 1 : i32>
// CHECK: memref.alloc() : memref<1x4x64x64xi8, 1 : i32>
// CHECK: memref.alloc() : memref<1x1x16x64xi8, 1 : i32>
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x256xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x64x64xi8, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: linalg.fill
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x4x16x64xi32, 1 : i32> memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: scf.for
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x256xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x64x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x16x64xi32, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi32, 2 : i32>)
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x4x16x64xi32, 1 : i32> memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: memref.dealloc %{{.*}} : memref<1x1x16x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x4x64x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x4x16x64xi32, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi32, 2 : i32>

