diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index ec54b67a0..15bcb682d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -15,6 +15,7 @@ #include "air/Dialect/AIR/AIRDialect.h" #include "air/Dialect/AIRRt/AIRRtDialect.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Utils/FlatbufferUtils.h" #include "llvm/Bitcode/BitcodeWriter.h" @@ -73,6 +74,7 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { void getDependentDialects(DialectRegistry &registry) const override { registry.insert(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAddLoweringStrategy.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAddLoweringStrategy.cpp index 0fc4a5020..07d437ede 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAddLoweringStrategy.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAddLoweringStrategy.cpp @@ -18,9 +18,8 @@ namespace mlir::iree_compiler::AMDAIE { namespace { /// Add the lowering strategy configurations to be used for ops. -class AMDAIEAddLoweringStrategyPass - : public impl::AMDAIEAddLoweringStrategyBase< - AMDAIEAddLoweringStrategyPass> { +class AMDAIELoweringStrategyPass + : public impl::AMDAIELoweringStrategyBase { public: void getDependentDialects(DialectRegistry &registry) const override { registry.insert< @@ -31,16 +30,16 @@ class AMDAIEAddLoweringStrategyPass vector::VectorDialect>(); } - AMDAIEAddLoweringStrategyPass() = default; - AMDAIEAddLoweringStrategyPass(const AMDAIEAddLoweringStrategyOptions &options) - : AMDAIEAddLoweringStrategyBase(options) {} - AMDAIEAddLoweringStrategyPass(const AMDAIEAddLoweringStrategyPass &pass){}; + AMDAIELoweringStrategyPass() = default; + AMDAIELoweringStrategyPass(const AMDAIELoweringStrategyOptions &options) + : AMDAIELoweringStrategyBase(options) {} + AMDAIELoweringStrategyPass(const AMDAIELoweringStrategyPass &pass){}; void runOnOperation() override; }; } // namespace -void AMDAIEAddLoweringStrategyPass::runOnOperation() { +void AMDAIELoweringStrategyPass::runOnOperation() { IREE::HAL::ExecutableVariantOp variantOp = getOperation(); ModuleOp moduleOp = variantOp.getInnerModule(); if (!moduleOp) { @@ -48,14 +47,14 @@ void AMDAIEAddLoweringStrategyPass::runOnOperation() { "Expected a variantOp root with an inner ModuleOp"); return signalPassFailure(); } - if (failed(initAIELaunchConfig(moduleOp, useUKernelStrategy))) { + if (failed(initAIELaunchConfig(moduleOp, usePassPipeline))) { return signalPassFailure(); } } -std::unique_ptr createAMDAIEAddLoweringStrategyPass( - AMDAIEAddLoweringStrategyOptions options) { - return std::make_unique(options); +std::unique_ptr createAMDAIELoweringStrategyPass( + AMDAIELoweringStrategyOptions options) { + return std::make_unique(options); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp index 1d06a20f7..474a30ea5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp +++
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp @@ -34,6 +34,7 @@ using mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr; namespace mlir::iree_compiler::AMDAIE { namespace { + /// Lowers an hal.executable.variant operation to scalar/native-vector /// code. Invokes different compilation pipeline to /// - first lower to scalar/native-vector code @@ -56,6 +57,9 @@ class AMDAIELowerExecutableTargetPass AMDAIELowerExecutableTargetPass() = default; AMDAIELowerExecutableTargetPass( const AMDAIELowerExecutableTargetPass &pass){}; + AMDAIELowerExecutableTargetPass( + const AMDAIELowerExecutableTargetOptions &options) + : AMDAIELowerExecutableTargetBase(options) {} void runOnOperation() override; }; @@ -138,12 +142,17 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() { case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: addTransformDialectPasses(executableLoweringPipeline); break; - // TODO(avarma): Currently we are using "CPUDefault" but resorting to use - // the default case. Will soon have corresponding AIE enum. - default: + case IREE::Codegen::DispatchLoweringPassPipeline::None: { TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp); - addPadBasedPassPipeline(executableLoweringPipeline, tilingConfig); - break; + if (usePassPipeline == AIEPassPipeline::SimplePackPipeline) { + addPackBasedPassPipeline(executableLoweringPipeline, tilingConfig); + } else if (usePassPipeline == AIEPassPipeline::PadPipeline) { + addPadBasedPassPipeline(executableLoweringPipeline, tilingConfig); + } + } break; + default: + variantOp.emitOpError("unhandled pass pipeline value set"); + return signalPassFailure(); } } @@ -152,8 +161,9 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() { } } -std::unique_ptr createAMDAIELowerExecutableTargetPass() { - return std::make_unique(); +std::unique_ptr createAMDAIELowerExecutableTargetPass( + AMDAIELowerExecutableTargetOptions options) { + return std::make_unique(options); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp index 5d4d81ded..28c33348e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/Transforms/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Pass/Pass.h" @@ -30,11 +31,11 @@ struct PackConfig { static FailureOr getPackConfig(RewriterBase &rewriter, int packLevel) { PackConfig config; - if (packLevel == 1) { + if (packLevel == 0) { // packed size for [M, N, K] - config.packedSizes = {rewriter.getI64IntegerAttr(16), - rewriter.getI64IntegerAttr(64), - rewriter.getI64IntegerAttr(64)}; + config.packedSizes = {rewriter.getI64IntegerAttr(8), + rewriter.getI64IntegerAttr(16), + rewriter.getI64IntegerAttr(16)}; // Transpose B matrix from [K N n k] to [K N k n] config.transposePackIndices = {1}; // There is no corresponding unpack for the specified pack operation @@ -42,7 +43,7 @@ static FailureOr getPackConfig(RewriterBase &rewriter, config.unpackEmpty = {0}; config.innerPerm = {{1, 0}}; config.outerPerm = {{0, 1}}; - } else if (packLevel == 2) { + } else if (packLevel == 1) 
{ // packed size for [M, N, K, m, n, k] config.packedSizes = { rewriter.getI64IntegerAttr(0), rewriter.getI64IntegerAttr(0), @@ -163,6 +164,12 @@ void AMDAIEPackAndTransposePass::runOnOperation() { // Update packed linalg op packedOp = packTransResult->transposedLinalgOp; } + + // Get the lowering config from the previous linalgOp and add it to the + // packedOp + if (auto config = getLoweringConfig(linalgOp)) { + setLoweringConfig(packedOp, config); + } } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index d06ed89ea..4cfac19f1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -63,6 +63,11 @@ static bool isTilingReductionDimension(TilingInterface consumerOp, return false; } +static bool consumerToSkip(TilingInterface op) { + if (isa<linalg::CopyOp>(op) || isa<tensor::UnPackOp>(op)) return true; + return false; +} + LogicalResult applyTileAndFuse(RewriterBase &rewriter, TilingInterface rootOp, DominanceInfo &dominanceInfo, scf::SCFTileAndFuseOptions &tileAndFuseOptions) { @@ -106,9 +111,9 @@ void AMDAIETileAndFusePass::runOnOperation() { TilingInterface consumerOp; funcOp->walk([&](TilingInterface op) { - // Find the next consumer op if it does not have loops OR if it is a - // linalg.copy op. - if (op.getLoopIteratorTypes().empty() || isa<linalg::CopyOp>(op)) + // Find the next consumer op if it does not have loops OR if it is in + // the skip-ops list, which currently contains linalg.copy and tensor.unpack. + if (op.getLoopIteratorTypes().empty() || consumerToSkip(op)) return WalkResult::advance(); consumerOp = op; return WalkResult::interrupt(); @@ -146,7 +151,7 @@ void AMDAIETileAndFusePass::runOnOperation() { getAsIndexOpFoldResult(context, tileSizesVal); auto options = scf::SCFTilingOptions().setTileSizes(tileSizes); // When tiling using scf.for we do not need to set any mapping. - if (tilingLevel != 2) { + if (!useSCFFor) { options.setMapping( {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY), gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp index e3b3035be..8a252cc35 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp @@ -19,7 +19,7 @@ namespace mlir::iree_compiler::AMDAIE { /// implements the contraction operation interface. static LogicalResult setRootConfig(func::FuncOp entryPointFn, linalg::MatmulOp matmulOp, - bool useUKernelStrategy) { + AIEPassPipeline usePassPipeline) { assert(!getLoweringConfig(matmulOp) && "expected lowering_config is not set"); auto linalgOp = cast(matmulOp.getOperation()); unsigned numLoops = linalgOp.getNumLoops(); @@ -35,34 +35,47 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn, // TODO (nmeshram) : This needs to be moved in a separate more generalized // logic.
Also, need a flag to experiment between pad based and pack based // approach which will have different tile sizes and pass pipelines - TileSizesListType tileSizes; - if (useUKernelStrategy) { - SmallVector TileSizeLevel0 = {16, 64}; - SmallVector TileSizeLevel1 = {0, 0, 64}; - SmallVector TileSizeLevel2 = {1, 1}; - tileSizes = {TileSizeLevel0, TileSizeLevel1, TileSizeLevel2}; - } else { + if (usePassPipeline == AIEPassPipeline::PadPipeline) { SmallVector TileSizeLevel0 = {8, 8}; SmallVector TileSizeLevel1 = {4, 4}; SmallVector TileSizeLevel2 = {0, 0, 4}; - tileSizes = {TileSizeLevel0, TileSizeLevel1, TileSizeLevel2}; + TileSizesListType tileSizes = {TileSizeLevel0, TileSizeLevel1, + TileSizeLevel2}; + return setOpConfigAndEntryPointFnTranslation( + entryPointFn, matmulOp, tileSizes, + IREE::Codegen::DispatchLoweringPassPipeline::None); + } else if (usePassPipeline == AIEPassPipeline::SimplePackPipeline) { + SmallVector TileSizeLevel0 = {8, 16}; + SmallVector TileSizeLevel1 = {1, 1}; + SmallVector TileSizeLevel2 = {0, 0, 1}; + TileSizesListType tileSizes = {TileSizeLevel0, TileSizeLevel1, + TileSizeLevel2}; + return setOpConfigAndEntryPointFnTranslation( + entryPointFn, matmulOp, tileSizes, + IREE::Codegen::DispatchLoweringPassPipeline::None); + } else if (usePassPipeline == AIEPassPipeline::PackPipeline) { + SmallVector TileSizeLevel0 = {16, 64}; + SmallVector TileSizeLevel1 = {0, 0, 64}; + SmallVector TileSizeLevel2 = {1, 1}; + TileSizesListType tileSizes = {TileSizeLevel0, TileSizeLevel1, + TileSizeLevel2}; + return setOpConfigAndEntryPointFnTranslation( + entryPointFn, matmulOp, tileSizes, + IREE::Codegen::DispatchLoweringPassPipeline::None); } - - return setOpConfigAndEntryPointFnTranslation( - entryPointFn, matmulOp, tileSizes, - IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault); + return matmulOp.emitOpError("unhandled pass pipeline"); } /// Redirects to methods that set the configuration based on operation type. static LogicalResult setRootConfigImpl(func::FuncOp entryPointFn, Operation *op, - bool useUKernelStrategy) { + AIEPassPipeline usePassPipeline) { auto setRootConfigFn = [&](Operation *op) -> LogicalResult { return TypeSwitch(op) // TODO (nmeshram): This is very limited for now, plan is to // let it first crash for all the other ops and then consiously // add support for them, this way we can verify our work. .Case([&](auto op) { - return setRootConfig(entryPointFn, op, useUKernelStrategy); + return setRootConfig(entryPointFn, op, usePassPipeline); }) .Default([&](Operation *op) { return success(); }); }; @@ -72,7 +85,7 @@ static LogicalResult setRootConfigImpl(func::FuncOp entryPointFn, Operation *op, /// Sets the translation information to use for a dispatch region. static LogicalResult setTranslationInfoAndRootConfig( func::FuncOp entryPointFn, ArrayRef computeOps, - bool useUKernelStrategy) { + AIEPassPipeline usePassPipeline) { // Make sure that lowering_config is not preset on any compute ops. 
for (auto computeOp : computeOps) { if (getLoweringConfig(computeOp)) return failure(); } @@ -87,8 +100,7 @@ static LogicalResult setTranslationInfoAndRootConfig( return entryPointFn.emitError("Case with no root ops not yet supported."); } - if (failed( - setRootConfigImpl(entryPointFn, rootOperation, useUKernelStrategy))) { + if (failed(setRootConfigImpl(entryPointFn, rootOperation, usePassPipeline))) { return failure(); } @@ -98,7 +110,8 @@ static LogicalResult setTranslationInfoAndRootConfig( return success(); } -LogicalResult initAIELaunchConfig(ModuleOp moduleOp, bool useUKernelStrategy) { +LogicalResult initAIELaunchConfig(ModuleOp moduleOp, + AIEPassPipeline usePassPipeline) { llvm::StringMap exportOps = getAllEntryPoints(moduleOp); for (auto funcOp : moduleOp.getOps()) { @@ -113,7 +126,7 @@ LogicalResult initAIELaunchConfig(ModuleOp moduleOp, bool useUKernelStrategy) { SmallVector computeOps = getComputeOps(funcOp); if (failed(setTranslationInfoAndRootConfig(funcOp, computeOps, - useUKernelStrategy))) { + usePassPipeline))) { return failure(); } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h index 0d40ed0d8..f71faf089 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h @@ -12,8 +12,18 @@ namespace mlir::iree_compiler::AMDAIE { +/// Enum to pick the lowering pass pipeline to use. The pass-pipeline enums +/// that IREE generates via tablegen are not extensible, so we define our own +/// enum here to select between the different AIE lowering pipelines. +enum class AIEPassPipeline : int32_t { + PadPipeline = 0, + PackPipeline = 1, + SimplePackPipeline = 2, + None = 3 +}; + LogicalResult initAIELaunchConfig(ModuleOp moduleOp, - bool useUKernelStrategy = false); + AIEPassPipeline usePassPipeline); } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 892626d76..ffa88fa2c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -1,4 +1,5 @@ // Copyright 2023 The IREE Authors + // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
@@ -7,6 +8,7 @@ #ifndef IREE_AMD_AIE_TRANSFORMS_PASSDETAIL_H_ #define IREE_AMD_AIE_TRANSFORMS_PASSDETAIL_H_ +#include "iree-amd-aie/Transforms/KernelDispatch.h" #include "iree/compiler/Dialect/HAL/IR/HALOps.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -22,7 +24,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECLEANUP #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL #define GEN_PASS_DEF_AMDAIELOWEREXECUTABLETARGET -#define GEN_PASS_DEF_AMDAIEADDLOWERINGSTRATEGY +#define GEN_PASS_DEF_AMDAIELOWERINGSTRATEGY #define GEN_PASS_DEF_AMDAIELOWERWORKGROUPCOUNT #define GEN_PASS_DEF_AMDAIEPACKANDTRANSPOSE #define GEN_PASS_DEF_AMDAIEPAD diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index e03aee37c..70c69f599 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -22,6 +22,22 @@ namespace mlir::iree_compiler::AMDAIE { +/// Command line options used purely for development purposes. Not to be relied +/// on in any way. +static llvm::cl::opt clUsePipeline( + "iree-amdaie-use-pipeline", + llvm::cl::desc("Pick the lowering pipeline to use"), + llvm::cl::values( + clEnumValN(AIEPassPipeline::PadPipeline, "pad", + "Use IREE lowering to AIR dialect through pad operations"), + clEnumValN( + AIEPassPipeline::PackPipeline, "pack", + "Use the IREE lowering to AIR dialect through pack operation"), + clEnumValN(AIEPassPipeline::SimplePackPipeline, "simple-pack", + "Use the simplified IREE lowering to AIR dialect through " + "pack operation")), + llvm::cl::init(AIEPassPipeline::SimplePackPipeline)); + //===---------------------------------------------------------------------===// // Default allocation functions for AIE backend //===---------------------------------------------------------------------===// @@ -103,11 +119,84 @@ void addPadBasedPassPipeline(OpPassManager &pm, TilingConfig &tilingConfig) { pm.addPass(createCSEPass()); } +void addPackBasedPassPipeline(OpPassManager &pm, TilingConfig &tilingConfig) { + auto &modulePassManager = pm.nest(); + modulePassManager.addNestedPass(createAMDAIECleanupPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + + AMDAIETileAndFuseOptions tileOptions; + AMDAIEPackAndTransposeOptions packOptions; + AMDAIEBufferizeToAllocationOptions bufferizeOptions; + + // First level tiling using scf.forall + tileOptions.tilingLevel = 0; + tileOptions.useSCFFor = false; + modulePassManager.addNestedPass( + createAMDAIETileAndFusePass(tileOptions)); + modulePassManager.addNestedPass(createAMDAIECleanupPass()); + modulePassManager.addPass(createCanonicalizerPass()); + modulePassManager.addPass(createCSEPass()); + + // First level packing and bufferize to allocation + packOptions.packLevel = 0; + modulePassManager.addNestedPass( + createAMDAIEPackAndTransposePass(packOptions)); + bufferizeOptions.memorySpace = 1; + bufferizeOptions.bufferizeLevel = -1; + modulePassManager.addNestedPass( + createAMDAIEBufferizeToAllocationPass(bufferizeOptions)); + + // Second level tiling using scf.forall + tileOptions.tilingLevel = 1; + tileOptions.useSCFFor = false; + modulePassManager.addNestedPass( + createAMDAIETileAndFusePass(tileOptions)); + modulePassManager.addNestedPass(createAMDAIECleanupPass()); + modulePassManager.addPass(createCanonicalizerPass()); + modulePassManager.addPass(createCSEPass()); + + // 
Fuse fill into forall loop + modulePassManager.addNestedPass( + createAMDAIEFuseFillIntoForallPass()); + modulePassManager.addNestedPass(createAMDAIECleanupPass()); + modulePassManager.addPass(createCanonicalizerPass()); + modulePassManager.addPass(createCSEPass()); + + // Second level packing and bufferize to allocation + packOptions.packLevel = 1; + modulePassManager.addNestedPass( + createAMDAIEPackAndTransposePass(packOptions)); + bufferizeOptions.memorySpace = 2; + bufferizeOptions.bufferizeLevel = -1; + modulePassManager.addNestedPass( + createAMDAIEBufferizeToAllocationPass(bufferizeOptions)); + + // Tile the reduction loops + tileOptions.tilingLevel = 2; + tileOptions.useSCFFor = true; + modulePassManager.addNestedPass( + createAMDAIETileAndFusePass(tileOptions)); + modulePassManager.addNestedPass(createAMDAIECleanupPass()); + modulePassManager.addPass(createCanonicalizerPass()); + modulePassManager.addPass(createCSEPass()); + + // Comprehensive bufferization + addAMDAIEBufferizePasses(modulePassManager); +} + void buildAMDAIETransformPassPipeline(OpPassManager &pm) { addCommonTargetExecutablePreprocessingPasses(pm); - pm.addPass(createEraseHALDescriptorTypeFromMemRefPass()); - pm.addPass(createAMDAIEAddLoweringStrategyPass()); - pm.addPass(createAMDAIELowerExecutableTargetPass()); + { + AMDAIELoweringStrategyOptions options; + options.usePassPipeline = clUsePipeline; + pm.addPass(createAMDAIELoweringStrategyPass(options)); + } + { + AMDAIELowerExecutableTargetOptions options; + options.usePassPipeline = clUsePipeline; + pm.addPass(createAMDAIELowerExecutableTargetPass(options)); + } pm.addPass(createAMDAIELowerWorkgroupCountPass()); auto &modulePassManager = pm.nest(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 59b906dfe..1f9ecdb1f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -35,6 +35,10 @@ void buildAMDAIETransformPassPipeline(OpPassManager &pm); void addPadBasedPassPipeline(OpPassManager &passManager, TilingConfig &tilingConfig); +/// Populates passes needed to lower the IR via a Pack based approach. +void addPackBasedPassPipeline(OpPassManager &passManager, + TilingConfig &tilingConfig); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. std::unique_ptr createAMDAIEBridgeToAIRPass(); @@ -54,12 +58,13 @@ std::unique_ptr createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass(); /// Create a pass to fuse the linalg.fill into the forall loops. std::unique_ptr createAMDAIEFuseFillIntoForallPass(); -/// Create pass for adding lowering strategy configurations. -std::unique_ptr createAMDAIEAddLoweringStrategyPass( - AMDAIEAddLoweringStrategyOptions options = {}); - /// Create pass calling the dynamic pipeline for AMDAIE. -std::unique_ptr createAMDAIELowerExecutableTargetPass(); +std::unique_ptr createAMDAIELowerExecutableTargetPass( + AMDAIELowerExecutableTargetOptions options = {}); + +/// Create pass for adding lowering strategy configurations. +std::unique_ptr createAMDAIELoweringStrategyPass( + AMDAIELoweringStrategyOptions options = {}); /// Create a pass to lower workgroup count region of entry point operations. 
std::unique_ptr createAMDAIELowerWorkgroupCountPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index bf99a5ecf..9bfe89df6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -58,20 +58,28 @@ def AMDAIEFuseFillIntoForall : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFuseFillIntoForallPass()"; } -def AMDAIEAddLoweringStrategy : - Pass<"iree-amdaie-add-lowering-strategy", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> { - let summary = "Add lowering strategy configurations to be used"; - let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAddLoweringStrategyPass()"; - let options = [ - Option<"useUKernelStrategy", "use-ukernel-strategy", "bool", /*default=*/"false", - "Whether to use the ukernel tiling strategy"> - ]; -} - def AMDAIELowerExecutableTarget : Pass<"iree-amdaie-lower-executable-target", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> { let summary = "Perform lowering of executable target using one of the IREE::HAL::DispatchLoweringPassPipeline"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELowerExecutableTargetPass()"; + let options = [ + Option<"usePassPipeline", "use-pass-pipeline", + "mlir::iree_compiler::AMDAIE::AIEPassPipeline", + /*default=*/"mlir::iree_compiler::AMDAIE::AIEPassPipeline::PadPipeline", + "Pass pipeline to use while lowering to AIR dialect"> + ]; +} + +def AMDAIELoweringStrategy : + Pass<"iree-amdaie-lowering-strategy", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> { + let summary = "Add lowering strategy configurations to be used"; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELoweringStrategyPass()"; + let options = [ + Option<"usePassPipeline", "use-pass-pipeline", + "mlir::iree_compiler::AMDAIE::AIEPassPipeline", + /*default=*/"mlir::iree_compiler::AMDAIE::AIEPassPipeline::PadPipeline", + "Pass pipeline to use while lowering to AIR dialect"> + ]; } def AMDAIELowerWorkgroupCount : @@ -86,7 +94,7 @@ def AMDAIEPackAndTranspose : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPackAndTransposePass()"; let options = [ - Option<"packLevel", "pack-level", "int64_t", /*default=*/"1", + Option<"packLevel", "pack-level", "int64_t", /*default=*/"-1", "Set the packing level number"> ]; } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir index 42e14ab6a..01888c513 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir @@ -1,13 +1,13 @@ -// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-pack-and-transpose{pack-level=1}))' --split-input-file %s | FileCheck --check-prefix=CHECK-1 %s +// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-pack-and-transpose{pack-level=0}))' --split-input-file %s | FileCheck %s func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0 : tensor<16x256xi8>, %arg1 : tensor<256x256xi8>) -> tensor<16x256xi32> { %c0 = arith.constant 0 : index %c0_i32 = arith.constant 0 : i32 %5 = tensor.empty() : tensor<16x256xi32> %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<16x256xi32>) -> tensor<16x256xi32> - // 
CHECK-1: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %{{.*}} : tensor<16x256xi8> -> tensor<1x4x16x64xi8> - // CHECK-1: tensor.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %{{.*}} : tensor<256x256xi8> -> tensor<4x4x64x64xi8> - // CHECK-1: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %{{.*}} : tensor<16x256xi32> -> tensor<1x4x16x64xi32> + // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %{{.*}} : tensor<16x256xi8> -> tensor<2x16x8x16xi8> + // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %{{.*}} : tensor<256x256xi8> -> tensor<16x16x16x16xi8> + // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %{{.*}} : tensor<16x256xi32> -> tensor<2x16x8x16xi32> %7 = linalg.matmul ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>) outs(%6 : tensor<16x256xi32>) -> tensor<16x256xi32> return %7 : tensor<16x256xi32> } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir index 99fa2a64b..7f6fef3dd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-pack-and-transpose{pack-level=2}))' --split-input-file %s | FileCheck --check-prefix=CHECK-2 %s +// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-pack-and-transpose{pack-level=1}))' --split-input-file %s | FileCheck %s #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0) -> (d0 * 64)> @@ -25,9 +25,9 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0: tensor<1 %extracted_slice_4 = tensor.extract_slice %pack_2[0, %arg4, 0, 0] [4, 1, 64, 64] [1, 1, 1, 1] : tensor<4x1x64x64xi8> to tensor<4x1x64x64xi8> %extracted_slice_5 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0] [1, 1, 16, 64] [1, 1, 1, 1] : tensor<1x1x16x64xi32> to tensor<1x1x16x64xi32> %13 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_5 : tensor<1x1x16x64xi32>) -> tensor<1x1x16x64xi32> - // CHECK-2: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x4x16x64xi8> -> tensor<1x4x8x4x4x8xi8> - // CHECK-2: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<4x1x64x64xi8> -> tensor<4x1x8x8x8x8xi8> - // CHECK-2: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32> + // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x4x16x64xi8> -> tensor<1x4x8x4x4x8xi8> + // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<4x1x64x64xi8> -> tensor<4x1x8x8x8x8xi8> + // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32> %14 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} 
ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x4x16x64xi8>, tensor<4x1x64x64xi8>) outs(%13 : tensor<1x1x16x64xi32>) { ^bb0(%in: i8, %in_6: i8, %out: i32): %15 = arith.extsi %in : i8 to i32 diff --git a/tests/samples/CMakeLists.txt b/tests/samples/CMakeLists.txt index 18268c68c..09d0d7b52 100644 --- a/tests/samples/CMakeLists.txt +++ b/tests/samples/CMakeLists.txt @@ -8,8 +8,8 @@ iree_lit_test_suite( NAME lit SRCS - "matmul_fill_static_i32.mlir" - "matmul_fill_static_i32_config.mlir" + "pack_pipeline_e2e.mlir" + "pad_pipeline_e2e.mlir" TOOLS ${IREE_LLD_TARGET} FileCheck diff --git a/tests/samples/matmul_fill_static_i32_config.mlir b/tests/samples/matmul_fill_static_i32_config.mlir deleted file mode 100644 index 84669436b..000000000 --- a/tests/samples/matmul_fill_static_i32_config.mlir +++ /dev/null @@ -1,41 +0,0 @@ -// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" | FileCheck %s - -// This test case is just to demonstrate that TransformDialectCodegen is not being set -// and the e2e still works. -// CHECK-LABEL: hal.executable.export public @matmul_static_dispatch_0_matmul_8x8x16_i32 -// CHECK: aie.device(ipu) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: func.func @matmul_static_dispatch_0_matmul_8x8x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<16x8xi32>, %arg2: memref<8x8xi32>) -// CHECK: aiex.ipu.dma_memcpy_nd -// CHECK: aiex.ipu.dma_memcpy_nd -// CHECK: aiex.ipu.dma_memcpy_nd -// CHECK: aiex.ipu.sync -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -hal.executable private @matmul_static { - hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>) { - hal.executable.export public @matmul_static_dispatch_0_matmul_8x8x16_i32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #translation} { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @matmul_static_dispatch_0_matmul_8x8x16_i32() { - %c0 = arith.constant 0 : index - %c0_i32 = arith.constant 0 : i32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 16], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x16xi32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 8], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<16x8xi32> - %5 = tensor.empty() : tensor<8x8xi32> - %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<8x8xi32>) -> tensor<8x8xi32> - %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<8x16xi32>, tensor<16x8xi32>) outs(%6 : tensor<8x8xi32>) -> tensor<8x8xi32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [8, 8], strides = [1, 1] : tensor<8x8xi32> -> !flow.dispatch.tensor> - return - } - } - } -} diff --git a/tests/samples/pack_pipeline_e2e.mlir b/tests/samples/pack_pipeline_e2e.mlir 
new file mode 100644 index 000000000..6000db124 --- /dev/null +++ b/tests/samples/pack_pipeline_e2e.mlir @@ -0,0 +1,24 @@ +// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" | FileCheck %s --check-prefix=CPP + +// This test demonstrates Pack pipeline based e2e lowering. + +// To check the cpp path equivalent to the transform dialect script. +// CPP-LABEL: hal.executable.export public @matmul_static_dispatch_0_matmul_8x32x16_i32 +// CPP: aie.device(ipu) +// CPP: aie.shim_dma_allocation +// CPP: aie.shim_dma_allocation +// CPP: aie.shim_dma_allocation +// CPP: func.func @matmul_static_dispatch_0_matmul_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<16x32xi32>, %arg2: memref<8x32xi32>) +// CPP: aiex.ipu.dma_memcpy_nd +// CPP: aiex.ipu.dma_memcpy_nd +// CPP: aiex.ipu.dma_memcpy_nd +// CPP: aiex.ipu.sync +func.func @matmul_static(%lhs : tensor<8x16xi32>, + %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> { + %empty = tensor.empty() : tensor<8x32xi32> + %cst = arith.constant 0 : i32 + %fill = linalg.fill ins(%cst : i32) outs(%empty : tensor<8x32xi32>) -> tensor<8x32xi32> + %2 = linalg.matmul ins(%lhs, %rhs : tensor<8x16xi32>, tensor<16x32xi32>) + outs(%fill : tensor<8x32xi32>) -> tensor<8x32xi32> + return %2 : tensor<8x32xi32> +} diff --git a/tests/samples/matmul_fill_static_i32.mlir b/tests/samples/pad_pipeline_e2e.mlir similarity index 92% rename from tests/samples/matmul_fill_static_i32.mlir rename to tests/samples/pad_pipeline_e2e.mlir index bf669807e..99c5de6a4 100644 --- a/tests/samples/matmul_fill_static_i32.mlir +++ b/tests/samples/pad_pipeline_e2e.mlir @@ -1,5 +1,7 @@ // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-codegen-transform-dialect-library=%S/matmul_fill_spec_pad.mlir | FileCheck %s --check-prefix=TRANSFORM -// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" | FileCheck %s --check-prefix=CPP +// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-use-pipeline=pad | FileCheck %s --check-prefix=CPP + +// This test demonstrates Pad pipeline based e2e lowering. // To check the transform dialect script path. // TRANSFORM-LABEL: hal.executable.export public @matmul_static_dispatch_0_matmul_8x8x16_i32
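
For reference, here is a minimal sketch (not part of the patch) of how the new option plumbing fits together. Pass, option, and enum names are taken from the diff above; the helper function name and the surrounding pass-manager setup are illustrative assumptions, and this mirrors what buildAMDAIETransformPassPipeline in Passes.cpp now does.

// Sketch only: thread one AIEPassPipeline choice through both new passes,
// replacing the old boolean `useUKernelStrategy` flag.
#include "iree-amd-aie/Transforms/KernelDispatch.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;
using namespace mlir::iree_compiler::AMDAIE;

// Hypothetical helper; `pm` is assumed to be an already-configured pass
// manager rooted at the hal.executable.variant op.
static void addStrategyAndLoweringPasses(OpPassManager &pm,
                                         AIEPassPipeline pipeline) {
  // Select the tile-size strategy for the chosen pipeline.
  AMDAIELoweringStrategyOptions strategyOptions;
  strategyOptions.usePassPipeline = pipeline;
  pm.addPass(createAMDAIELoweringStrategyPass(strategyOptions));

  // Lower the executable target with the matching pad- or pack-based pipeline.
  AMDAIELowerExecutableTargetOptions lowerOptions;
  lowerOptions.usePassPipeline = pipeline;
  pm.addPass(createAMDAIELowerExecutableTargetPass(lowerOptions));
}

From the command line the same choice is exposed through the development-only flag added in Passes.cpp, e.g. --iree-amdaie-use-pipeline=pad (choices: pad, pack, simple-pack; default: simple-pack), as exercised by the RUN lines in the tests/samples files above.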