Add multicore flag (nod-ai#128)
By default, the PackPipeline uses only one AIE tile, tiling to `(16,64)` in the first tiling operation.

With the new flag `iree-amdaie-num-cores`, this behavior can be adapted:
- `iree-amdaie-num-cores=2`: tile to `(16,128)` and use two AIE tiles
- `iree-amdaie-num-cores=4`: tile to `(16,256)` and use four AIE tiles

Values other than 1, 2, or 4 result in an error.
This flag is ignored within the PadPipeline and SimplePackPipeline.
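
For illustration, here is a minimal sketch of the tile-size selection this flag drives, assuming only the behavior described above and in the diff below (the helper name `packPipelineTileSizesL0` is hypothetical, not part of this commit):

```cpp
#include <cstdint>
#include <vector>

// First-level tile sizes for the PackPipeline: the second (N) tile size
// scales linearly with the number of AIE cores in use.
std::vector<int64_t> packPipelineTileSizesL0(int32_t numCores) {
  // Only 1, 2, or 4 cores are handled; the pass emits an op error for
  // any other value (modeled here as an empty result).
  if (numCores != 1 && numCores != 2 && numCores != 4) return {};
  return {16, 64 * numCores};  // (16,64), (16,128), or (16,256)
}
```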

---------

Co-authored-by: Franz Haniel <[email protected]>
frafranz and FranzHaniel authored Feb 9, 2024
1 parent 69b6422 commit 831d3d5
Showing 8 changed files with 132 additions and 12 deletions.
@@ -47,7 +47,10 @@ void AMDAIELoweringStrategyPass::runOnOperation() {
"Expected a variantOp root with an inner ModuleOp");
return signalPassFailure();
}
if (failed(initAIELaunchConfig(moduleOp, usePassPipeline))) {
// To simplify development, the number of cores can be passed as a flag during
// compilation. In the future these parameters could be read from a file.
struct AIEConfig cfg = {numCores};
if (failed(initAIELaunchConfig(moduleOp, usePassPipeline, cfg))) {
return signalPassFailure();
}
}
@@ -20,7 +20,8 @@ namespace mlir::iree_compiler::AMDAIE {
/// implements the contraction operation interface.
static LogicalResult setRootConfig(func::FuncOp entryPointFn,
linalg::MatmulOp matmulOp,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline,
AIEConfig cfg) {
assert(!getLoweringConfig(matmulOp) && "expected lowering_config is not set");
auto linalgOp = cast<linalg::LinalgOp>(matmulOp.getOperation());
unsigned numLoops = linalgOp.getNumLoops();
@@ -55,7 +56,9 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
entryPointFn, matmulOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::None);
} else if (usePassPipeline == AIEPassPipeline::PackPipeline) {
SmallVector<int64_t> TileSizeLevel0 = {16, 64};
if (!(cfg.num_cores == 1 || cfg.num_cores == 2 || cfg.num_cores == 4))
return matmulOp.emitOpError("unhandled number of cores");
SmallVector<int64_t> TileSizeLevel0 = {16, 64 * cfg.num_cores};
SmallVector<int64_t> TileSizeLevel1 = {0, 0, 64};
SmallVector<int64_t> TileSizeLevel2 = {1, 1};
TileSizesListType tileSizes = {TileSizeLevel0, TileSizeLevel1,
@@ -69,14 +72,15 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,

/// Redirects to methods that set the configuration based on operation type.
static LogicalResult setRootConfigImpl(func::FuncOp entryPointFn, Operation *op,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline,
AIEConfig cfg) {
auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(op)
// TODO (nmeshram): This is very limited for now; the plan is to
// let it first crash for all the other ops and then consciously
// add support for them, this way we can verify our work.
.Case<linalg::MatmulOp>([&](auto op) {
return setRootConfig(entryPointFn, op, usePassPipeline);
return setRootConfig(entryPointFn, op, usePassPipeline, cfg);
})
.Default([&](Operation *op) { return success(); });
};
@@ -86,7 +90,7 @@ static LogicalResult setRootConfigImpl(func::FuncOp entryPointFn, Operation *op,
/// Sets the translation information to use for a dispatch region.
static LogicalResult setTranslationInfoAndRootConfig(
func::FuncOp entryPointFn, ArrayRef<Operation *> computeOps,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline, AIEConfig cfg) {
// Make sure that lowering_config is not preset on any compute ops.
for (auto computeOp : computeOps) {
if (getLoweringConfig(computeOp)) return failure();
@@ -101,7 +105,8 @@ static LogicalResult setTranslationInfoAndRootConfig(
return entryPointFn.emitError("Case with no root ops not yet supported.");
}

if (failed(setRootConfigImpl(entryPointFn, rootOperation, usePassPipeline))) {
if (failed(setRootConfigImpl(entryPointFn, rootOperation, usePassPipeline,
cfg))) {
return failure();
}

@@ -112,7 +117,8 @@
}

LogicalResult initAIELaunchConfig(ModuleOp moduleOp,
AIEPassPipeline usePassPipeline) {
AIEPassPipeline usePassPipeline,
AIEConfig cfg) {
llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
getAllEntryPoints(moduleOp);
for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
Expand All @@ -127,7 +133,7 @@ LogicalResult initAIELaunchConfig(ModuleOp moduleOp,

SmallVector<Operation *> computeOps = getComputeOps(funcOp);
if (failed(setTranslationInfoAndRootConfig(funcOp, computeOps,
usePassPipeline))) {
usePassPipeline, cfg))) {
return failure();
}
}
@@ -22,9 +22,15 @@ enum class AIEPassPipeline : int32_t {
None = 3
};

LogicalResult initAIELaunchConfig(ModuleOp moduleOp,
AIEPassPipeline usePassPipeline);
/// Struct specifying the number of cores to use. This will be replaced
/// by more versatile handling in the future.
struct AIEConfig {
int32_t num_cores;
};

LogicalResult initAIELaunchConfig(ModuleOp moduleOp,
AIEPassPipeline usePassPipeline,
AIEConfig cfg);
} // namespace mlir::iree_compiler::AMDAIE

#endif // IREE_AMD_AIE_TRANSFORMS_KERNELDISPATCH_H_
@@ -37,6 +37,10 @@ static llvm::cl::opt<AIEPassPipeline> clUsePipeline(
"pack operation")),
llvm::cl::init(AIEPassPipeline::SimplePackPipeline));

static llvm::cl::opt<int32_t> clNumCores(
"iree-amdaie-num-cores",
llvm::cl::desc("Choose the number of cores to use"), llvm::cl::init(1));

//===---------------------------------------------------------------------===//
// Default allocation functions for AIE backend
//===---------------------------------------------------------------------===//
@@ -271,6 +275,7 @@ void buildAMDAIETransformPassPipeline(OpPassManager &pm) {
{
AMDAIELoweringStrategyOptions options;
options.usePassPipeline = clUsePipeline;
options.numCores = clNumCores;
pm.addPass(createAMDAIELoweringStrategyPass(options));
}
{
@@ -94,7 +94,9 @@ def AMDAIELoweringStrategy :
"Use the more advanced pack-based lowering strategy, including peeling and double-buffering."),
clEnumValN(mlir::iree_compiler::AMDAIE::AIEPassPipeline::SimplePackPipeline, "simple-pack",
"Use the simple-pack based lowering strategy.")
)}]>
)}]>,
Option<"numCores", "num-cores", "int32_t", /*default=*/"1",
"Choose the number of cores to use">
];
}

2 changes: 2 additions & 0 deletions tests/samples/CMakeLists.txt
@@ -9,6 +9,8 @@ iree_lit_test_suite(
lit
SRCS
"pack_pipeline_funcIR.mlir"
"pack_pipeline_funcIR_2core.mlir"
"pack_pipeline_funcIR_4core.mlir"
"pad_pipeline_e2e.mlir"
"simple_pack_pipeline_e2e.mlir"
TOOLS
48 changes: 48 additions & 0 deletions tests/samples/pack_pipeline_funcIR_2core.mlir
@@ -0,0 +1,48 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --mlir-print-ir-after=fold-memref-alias-ops %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-use-pipeline=pack --iree-amdaie-num-cores=2 | FileCheck %s

func.func @matmul_example(%lhs: tensor<16x256xi8>, %rhs: tensor<256x256xi8>) -> tensor<16x256xi32>
{
%empty = tensor.empty() : tensor<16x256xi32>
%cst = arith.constant 0 : i32
%fill = linalg.fill ins(%cst : i32) outs(%empty : tensor<16x256xi32>) -> tensor<16x256xi32>
%res = linalg.matmul ins(%lhs, %rhs: tensor<16x256xi8>, tensor<256x256xi8>)
outs(%fill: tensor<16x256xi32>) -> tensor<16x256xi32>
return %res : tensor<16x256xi32>
}

// CHECK-LABEL: @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi32, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x2x16x64xi32, 1 : i32>
// CHECK: memref.alloc() : memref<1x2x64x64xi8, 1 : i32>
// CHECK: memref.alloc() : memref<1x1x16x64xi8, 1 : i32>
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x128xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xi8, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: linalg.fill
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[2048, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x2x16x64xi32, 1 : i32> memref<16x128xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: scf.for
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x128xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x128xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x16x64xi32, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi32, strided<[2048, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi32, 2 : i32>)
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[2048, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x2x16x64xi32, 1 : i32> memref<16x128xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: memref.dealloc %{{.*}} : memref<1x1x16x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x2x64x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x2x16x64xi32, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi32, 2 : i32>


48 changes: 48 additions & 0 deletions tests/samples/pack_pipeline_funcIR_4core.mlir
@@ -0,0 +1,48 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --mlir-print-ir-after=fold-memref-alias-ops %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-use-pipeline=pack --iree-amdaie-num-cores=4 | FileCheck %s

func.func @matmul_example(%lhs: tensor<16x256xi8>, %rhs: tensor<256x256xi8>) -> tensor<16x256xi32>
{
%empty = tensor.empty() : tensor<16x256xi32>
%cst = arith.constant 0 : i32
%fill = linalg.fill ins(%cst : i32) outs(%empty : tensor<16x256xi32>) -> tensor<16x256xi32>
%res = linalg.matmul ins(%lhs, %rhs: tensor<16x256xi8>, tensor<256x256xi8>)
outs(%fill: tensor<16x256xi32>) -> tensor<16x256xi32>
return %res : tensor<16x256xi32>
}

// CHECK-LABEL: @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi32, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.alloc() : memref<1x4x16x64xi32, 1 : i32>
// CHECK: memref.alloc() : memref<1x4x64x64xi8, 1 : i32>
// CHECK: memref.alloc() : memref<1x1x16x64xi8, 1 : i32>
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x256xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x64x64xi8, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: linalg.fill
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x4x16x64xi32, 1 : i32> memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: scf.for
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x64xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<64x256xi8, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x64x64xi8, 1 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x4x16x64xi32, 1 : i32>)
// CHECK: scf.forall
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x64x64xi8, strided<[16384, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x8x8x8xi8, 2 : i32>)
// CHECK: iree_linalg_ext.pack %{{.*}} : (memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x4x8xi32, 2 : i32>)
// CHECK: linalg.generic
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x1x8x4x4x8xi32, 2 : i32> memref<1x1x16x64xi32, strided<[4096, 1024, 64, 1], offset: ?>, 1 : i32>)
// CHECK: iree_linalg_ext.unpack %{{.*}} : (memref<1x4x16x64xi32, 1 : i32> memref<16x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
// CHECK: memref.dealloc %{{.*}} : memref<1x1x16x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x4x64x64xi8, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x4x16x64xi32, 1 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x8x8x8xi8, 2 : i32>
// CHECK: memref.dealloc %{{.*}} : memref<1x1x8x4x4x8xi32, 2 : i32>

