From 039ba234b33e3cb34442b220985d32c71b849949 Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 10 Oct 2024 15:40:25 -0700 Subject: [PATCH] Move most of LLVM lowering out of aie2xclbin (#838) This is a continuation of https://github.com/nod-ai/iree-amd-aie/commit/be42d1918f43ed04595a1384cd9274305e42f961 -- in that PR we moved vector->aievec into the main pipeline, this PR moves aievec->llvm into the main pipeline (and a few other dialect lowerings to llvm). Moving the lowering of func to llvm is tricky, because of the position of the pass `AMDAIECoreToStandardFunc` which lowers from CoreOp to func. TBC. --- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 73 ++++++--------- .../Transforms/AMDAIELoadAlignmentReset.cpp | 72 +++++++++++++++ .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 88 +++++++++---------- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 6 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 11 +++ .../test/disable_vectorization.mlir | 8 +- 8 files changed, 162 insertions(+), 98 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index ddbbad54c..13024aa11 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -13,9 +13,10 @@ #include #include "AMDAIETargets.h" -#include "iree-amd-aie/Transforms/Passes.h" +#include "aie/Passes.h" +#include "air/Conversion/AIRToAIEPass.h" +#include "iree-dialects/Dialect/LinalgTransform/Passes.h" #include "iree/compiler/Utils/ToolUtils.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" @@ -24,6 +25,15 @@ #include "llvm/Support/Path.h" 
#include "llvm/Support/Program.h" #include "llvm/Support/ToolOutputFile.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/IR/AsmState.h" @@ -33,6 +43,7 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/Passes.h" #define DEBUG_TYPE "amdaie-xclbingen" @@ -942,47 +953,20 @@ static LogicalResult generateXCLBin( return runTool(xclbinutilBin.value().string(), flags, verbose); } -// A pass which removes the alignment attribute from llvm load operations, if -// the alignment is less than 4 (2 or 1). -// -// Example replaces: -// -// ``` -// %113 = llvm.load %112 {alignment = 2 : i64} : !llvm.ptr -> vector<32xbf16> -// ``` -// -// with -// -// ``` -// %113 = llvm.load %112 : !llvm.ptr -> vector<32xbf16> -// ``` -// -// If this pass is not included in the pipeline, there is an alignment error -// later in the compilation. This is a temporary workaround while a better -// solution is found: propagation of memref.assume_alignment is one option. 
See -// also https://jira.xilinx.com/projects/AIECC/issues/AIECC-589 -namespace { -struct RemoveAlignment2FromLLVMLoadPass - : PassWrapper> { - void runOnOperation() override { - getOperation().walk([](Operation *op) { - if (auto loadOp = dyn_cast(op)) { - auto alignmentAttr = loadOp.getAlignmentAttr(); - if (alignmentAttr) { - int alignmentVal = alignmentAttr.getValue().getSExtValue(); - if (alignmentVal == 2 || alignmentVal == 1) { - loadOp.setAlignment(std::optional()); - } - } - } - }); - } - - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( - RemoveAlignment2FromLLVMLoadPass); -}; - -} // namespace +void addLowerToLLVMPasses(OpPassManager &pm) { + using namespace mlir; + pm.addPass(createFinalizeMemRefToLLVMConversionPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + ConvertFuncToLLVMPassOptions opts; + opts.useBarePtrCallConv = true; + pm.addPass(createConvertFuncToLLVMPass(opts)); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(createConvertControlFlowToLLVMPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); +} static LogicalResult generateUnifiedObject( MLIRContext *context, AIE::DeviceOp deviceOp, const std::string &outputFile, @@ -998,8 +982,7 @@ static LogicalResult generateUnifiedObject( printIRModuleScope, timing); pm.addPass(mlir::iree_compiler::AMDAIE::createAMDAIECoreToStandardPass()); - mlir::iree_compiler::AMDAIE::addLowerToLLVMPasses(pm); - pm.addPass(std::make_unique()); + addLowerToLLVMPasses(pm); if (verbose) { llvm::outs() << "\nRunning: "; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp new file mode 100644 index 000000000..8f39e0aed --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp @@ -0,0 +1,72 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under 
the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/SCF/Utils/Utils.h" +#include "mlir/Pass/Pass.h" +#define DEBUG_TYPE "iree-amdaie-acquire-release-to-use-lock" + +namespace mlir::iree_compiler::AMDAIE { + + using namespace mlir; + +namespace { + +// A pass which removes the alignment attribute from llvm load operations, +// if the alignment is less than 4 (2 or 1). +// +// Example. The pass replaces: +// +// ``` +// %113 = llvm.load %112 {alignment = 2 : i64} +// : !llvm.ptr -> vector<32xbf16> +// ``` +// +// with +// +// ``` +// %113 = llvm.load %112 +// : !llvm.ptr -> vector<32xbf16> +// ``` +// +// If this pass is not included in the matmul pipeline, there is an OOM error +// later in the compilation. This is a temporary workaround while a better +// solution is found: propagation of memref.assume_alignment is one option. 
+// See also https://jira.xilinx.com/projects/AIECC/issues/AIECC-589 + +class AMDAIELoadAlignmentReset + : public impl::AMDAIELoadAlignmentResetBase< + AMDAIELoadAlignmentReset> { + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + + void runOnOperation() override { + getOperation()->walk([](Operation *op) { + if (auto loadOp = dyn_cast(op)) { + auto alignmentAttr = loadOp.getAlignmentAttr(); + if (alignmentAttr) { + int alignmentVal = alignmentAttr.getValue().getSExtValue(); + if (alignmentVal == 2 || alignmentVal == 1) { + loadOp.setAlignment(std::optional()); + } + } + } + }); + } +}; + +} // namespace + +std::unique_ptr createAMDAIELoadAlignmentResetPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 1c91b4d90..d2a7cb0e2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -78,6 +78,7 @@ iree_cc_library( "AMDAIEInsertLoopsForVectorization.cpp" "AMDAIELinkExecutables.cpp" "AMDAIELocalizeLogicalObjectFifo.cpp" + "AMDAIELoadAlignmentReset.cpp" "AMDAIELogicalObjFifoSplittingUtils.cpp" "AMDAIELowerExecutableTarget.cpp" "AMDAIELowerFuncArgs.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 5deb1c72d..7c7105b0b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -55,6 +55,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEINSERTCORES #define GEN_PASS_DEF_AMDAIEINSERTLOOPSFORVECTORIZATION #define GEN_PASS_DEF_AMDAIELINKEXECUTABLES +#define GEN_PASS_DEF_AMDAIELOADALIGNMENTRESET #define 
GEN_PASS_DEF_AMDAIELOCALIZELOGICALOBJECTFIFO #define GEN_PASS_DEF_AMDAIELOWEREXECUTABLETARGET #define GEN_PASS_DEF_AMDAIELOWERINGSTRATEGY diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 6f3e08ebc..454d54ae9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -20,15 +20,13 @@ #include "air/Transform/AIRMiscPasses.h" #include "air/Transform/AffineLoopOptPass.h" #include "iree-amd-aie/IR/AMDAIEAttrs.h" +#include "iree-amd-aie/Transforms/Passes.h" #include "iree-dialects/Dialect/LinalgTransform/Passes.h" #include "iree/compiler/Codegen/Common/Passes.h" #include "iree/compiler/Utils/ToolUtils.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" -#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" -#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" #include "mlir/Dialect/Affine/Passes.h" @@ -541,6 +539,7 @@ void buildAMDAIETransformPassPipeline( }); } + void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, bool enablePacketFlow, TilePassPipeline useTilePipeline) { @@ -623,6 +622,44 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, addMLIRAIELoweringPasses(passManager); } +void addMLIRAIELoweringPasses(OpPassManager &pm) { + { + OpPassManager &devicePM = pm.nest(); + devicePM.addPass(createCanonicalizerPass()); + devicePM.addPass(createAMDAIEDmaToNpuPass()); + devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); + devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); + 
devicePM.addPass(createAMDAIEPathfinderPass()); + } + + pm.addPass(createCanonicalizerPass()); + pm.addPass(createConvertLinalgToLoopsPass()); + pm.addPass(createLowerAffinePass()); + pm.addPass(createConvertSCFToCFPass()); + + { + OpPassManager &devicePM = pm.nest(); + devicePM.addPass(createAMDAIELocalizeLocksPass()); + devicePM.addPass(createAMDAIENormalizeAddressSpacesPass()); + devicePM.addPass(createCanonicalizerPass()); + } + + mlir::iree_compiler::aievec::buildConvertVectorToAIEVec(pm); + + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(aievec::createConvertAIEVecToLLVMPass()); + pm.addPass(createConvertVectorToLLVMPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + pm.addPass(createLowerAffinePass()); + pm.addPass(createConvertMathToLLVMPass()); + pm.addPass(createArithToLLVMConversionPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(createAMDAIELoadAlignmentResetPass()); + pm.addPass(createCanonicalizerPass()); +} + // TODO (Erwei): The "packPeel" temporary argument should be removed once // pack-peel and pack-pad share the same pass pipeline. See TODOs inlined below // for details. 
@@ -796,30 +833,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device, addMLIRAIELoweringPasses(passManager); } -void addMLIRAIELoweringPasses(OpPassManager &passManager) { - { - OpPassManager &devicePM = passManager.nest(); - devicePM.addPass(createCanonicalizerPass()); - devicePM.addPass(createAMDAIEDmaToNpuPass()); - devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); - devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); - devicePM.addPass(createAMDAIEPathfinderPass()); - } - - passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createConvertLinalgToLoopsPass()); - passManager.addPass(createLowerAffinePass()); - passManager.addPass(createConvertSCFToCFPass()); - - { - OpPassManager &devicePM = passManager.nest(); - devicePM.addPass(createAMDAIELocalizeLocksPass()); - devicePM.addPass(createAMDAIENormalizeAddressSpacesPass()); - devicePM.addPass(createCanonicalizerPass()); - } - mlir::iree_compiler::aievec::buildConvertVectorToAIEVec(passManager); -} // NOTE: this runs on the top-level program module containing all hal.executable // ops. 
@@ -832,28 +846,6 @@ void buildAMDAIELinkingPassPipeline(OpPassManager &passManager) { mlir::createCanonicalizerPass()); } -void addLowerToLLVMPasses(OpPassManager &pm) { - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); - pm.addPass(aievec::createConvertAIEVecToLLVMPass()); - pm.addPass(createConvertVectorToLLVMPass()); - pm.addPass(memref::createExpandStridedMetadataPass()); - pm.addPass(createLowerAffinePass()); - pm.addPass(createConvertMathToLLVMPass()); - pm.addPass(createArithToLLVMConversionPass()); - pm.addPass(createFinalizeMemRefToLLVMConversionPass()); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); - ConvertFuncToLLVMPassOptions opts; - opts.useBarePtrCallConv = true; - pm.addPass(createConvertFuncToLLVMPass(opts)); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); - pm.addPass(createConvertControlFlowToLLVMPass()); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); -} - namespace { #define GEN_PASS_REGISTRATION #include "iree-amd-aie/Transforms/Passes.h.inc" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index df9bf6a06..00aa88694 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -29,6 +29,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device, /// currently the default passes used for lowering from AIE dialect. void addMLIRAIELoweringPasses(OpPassManager &passManager); + /// Populates passes needed to lower linalg/arith/math ops to LLVM dialect via /// the structured ops path. The pass manager `pm` here operate on the module /// within the IREE::HAL::ExecutableOp. 
@@ -39,8 +40,6 @@ void buildAMDAIETransformPassPipeline( bool enableVectorizationPasses, const std::string &pathToUkernels, bool enablePacketFlow); -void addLowerToLLVMPasses(OpPassManager &pm); - /// Populates passes needed to lower the IR via a Pack-Peel based approach. void addPackPeelBasedPassPipeline(OpPassManager &oassManager, TilingConfig &tilingConfig, @@ -268,6 +267,9 @@ std::unique_ptr createAMDAIETileAndFusePass( /// Create pass to propagate pack/unpack ops using upstream patterns. std::unique_ptr createAMDAIEPropagateDataLayoutPass(); +/// Create pass to reset the alignment of LLVM load operations. +std::unique_ptr createAMDAIELoadAlignmentResetPass(); + void registerAMDAIEPasses(); } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index bb8256ea6..e6910bbde 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -302,6 +302,17 @@ def AMDAIELinkExecutables : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinkExecutablesPass()"; } +def AMDAIELoadAlignmentReset : + Pass<"iree-amdaie-load-alignment-reset", ""> { + let summary = "Reset the alignment of the LLVM load operations."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELoadAlignmentResetPass()"; + let description = [{ + Reset the alignment of the LLVM load operations to the 'unset' + optional value. This is a workaround for an issue in peano, and + should eventually be removed. 
+ }]; +} + def AMDAIELocalizeLogicalObjectfifo : Pass<"iree-amdaie-localize-logicalobjectfifo", "ModuleOp"> { let summary = "Localize logical objectfifos to local parallel loop scopes."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir index 9686ec566..9a0e7ebf9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir @@ -10,6 +10,7 @@ // // 3) Not specifying the flag at all, which should use the default value (1). + // 1) Explicitly disabled: // RUN: iree-compile --iree-hal-target-backends=amd-aie \ // RUN: --compile-to=executable-targets --iree-amdaie-enable-vectorization-passes=0 %s | FileCheck %s -check-prefix=CHECK-DISABLED @@ -23,6 +24,7 @@ // RUN: --compile-to=executable-targets %s | FileCheck %s -check-prefix=CHECK-DEFAULT + func.func @mm_in_bf16_out_f32(%lhs: tensor<64x64xbf16>, %rhs: tensor<64x64xbf16>) -> tensor<64x64xf32> { %empty = tensor.empty() : tensor<64x64xf32> @@ -34,6 +36,6 @@ func.func @mm_in_bf16_out_f32(%lhs: tensor<64x64xbf16>, return %res : tensor<64x64xf32> } -// CHECK-DISABLED-NOT: aievec.matmul -// CHECK-ENABLED: aievec.matmul -// CHECK-DEFAULT: aievec.matmul +// CHECK-DISABLED-NOT: xllvm.intr.aie2.bf.mac16.conf +// CHECK-ENABLED: xllvm.intr.aie2.bf.mac16.conf +// CHECK-DEFAULT: xllvm.intr.aie2.bf.mac16.conf