Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UKernel] Add ukernel to be compiled through peano #1097

Merged
merged 3 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
lower_to_aie_pipeline="objectFifo",
name_suffix="",
use_chess=False,
use_chess_for_ukernel=True,
use_ukernel=False,
run_benchmark=False,
n_repeats=1,
Expand All @@ -59,6 +60,7 @@ def __init__(
self.lower_to_aie_pipeline = lower_to_aie_pipeline
self.name_suffix = name_suffix
self.use_chess = use_chess
self.use_chess_for_ukernel = use_chess_for_ukernel
self.use_ukernel = use_ukernel
self.run_benchmark = run_benchmark
self.n_repeats = n_repeats
Expand Down Expand Up @@ -104,6 +106,7 @@ def __init__(
tile_pipeline = test_params.tile_pipeline
lower_to_aie_pipeline = test_params.lower_to_aie_pipeline
use_chess = test_params.use_chess
use_chess_for_ukernel = test_params.use_chess_for_ukernel
use_ukernel = test_params.use_ukernel
run_benchmark = test_params.run_benchmark
n_repeats = test_params.n_repeats
Expand All @@ -128,8 +131,17 @@ def __init__(
self.labels.append("Peano")

if use_ukernel:
self.name += "_ukernel"
self.labels.append("UKernel")
if use_chess_for_ukernel:
self.name += "_ukernel_chess"
self.add_aie_compilation_flags(
[f"--iree-amd-aie-enable-chess-for-ukernel=1"]
)
else:
self.name += "_ukernel_peano"
self.add_aie_compilation_flags(
[f"--iree-amd-aie-enable-chess-for-ukernel=0"]
)

if run_benchmark:
self.name += "_benchmark"
Expand Down Expand Up @@ -1707,7 +1719,9 @@ def __init__(self):
test_params=TestParams(
use_ukernel=True,
use_chess=False,
Abhishek-Varma marked this conversation as resolved.
Show resolved Hide resolved
use_chess_for_ukernel=False,
run_on_target=["npu4"],
tile_pipeline="pack-peel-4-level-tiling",
aie_compilation_flags=[
"--iree-amdaie-num-rows=4",
"--iree-amdaie-num-cols=8",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,7 @@ LogicalResult AIETargetBackend::serializeExecutable(
/*timing=*/options.aie2xclbinTiming,
/*tempDir=*/entryPointWorkDir.str().str(),
/*useChess=*/options.useChess,
/*useChessForUKernel=*/options.useChessForUKernel,
/*verbose=*/options.showInvokedCommands,
/*vitisDir=*/options.vitisInstallDir.empty()
? std::nullopt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ struct AMDAIEOptions {
// Use the chess compiler. The default is to use peano.
bool useChess{false};

// Use the chess compiler for ukernels. Defaults to true (chess); when false, ukernels are compiled with peano.
bool useChessForUKernel{true};

// Additional flags to run peano's opt with (if peano is the backend compiler
// selected). These are mostly appended on the end of the default flags, but
// some flags may replace existing flags if they conflict.
Expand Down Expand Up @@ -127,6 +130,11 @@ struct AMDAIEOptions {
llvm::cl::cat(category),
llvm::cl::desc("Use the legacy chess compiler"));

binder.opt<bool>(
"iree-amd-aie-enable-chess-for-ukernel", useChessForUKernel,
llvm::cl::cat(category),
llvm::cl::desc("Use the chess compiler for compiling ukernels"));

binder.opt<std::string>(
"iree-amdaie-enable-ukernels", enableAMDAIEUkernels,
llvm::cl::cat(category),
Expand Down
156 changes: 105 additions & 51 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,23 @@ using Path = std::filesystem::path;
namespace mlir::iree_compiler::AMDAIE {
namespace detail {

// Splits a quoted, whitespace-separated flag string into individual flags.
//
// `flags` must either be empty, in which case an empty vector is returned, or
// be wrapped in double quotes, i.e. be of the form "-flag1 -flag2 ...". The
// enclosing quotes are stripped and the remainder is split on whitespace.
//
// Returns failure, after printing a diagnostic to stderr, if a non-empty
// `flags` does not start and end with a double quote.
FailureOr<std::vector<std::string>> flagStringToVector(
    const std::string &flags) {
  if (flags.empty()) return std::vector<std::string>{};
  // A lone `"` is both the first and the last character of the string, so the
  // size check is what enforces two distinct enclosing quotes.
  if (flags.size() < 2 || flags.front() != '"' || flags.back() != '"') {
    // The diagnostic ends with a newline so that any message a caller prints
    // afterwards starts on its own line.
    llvm::errs()
        << "additional peano opt flags must be of the form "
           "\"-flag1 -flag2 ...\". Specifically it must start and end with \".\n";
    return failure();
  }
  // Drop the enclosing quotes and split what remains on whitespace.
  std::istringstream iss(flags.substr(1, flags.size() - 2));
  return std::vector<std::string>{std::istream_iterator<std::string>{iss},
                                  std::istream_iterator<std::string>{}};
}

// Peano's `opt` program optimizes llvm-ir (.ll files). We run it with a system
// call. This function constructs the flags to pass to `opt`. There are some
// default flags, most of which are copied from llvm-aie. See
Expand All @@ -70,11 +87,10 @@ namespace detail {
// clang-format on
//
// There are also additional flags which have been passed down from the user,
// `additionalPeanoOptFlags`. This function appends these user specific flags,
// `additionalFlags`. This function appends these user specific flags,
// and checks that they are valid. If they are not, it returns failure.
FailureOr<std::vector<std::string>> makePeanoOptArgs(
const std::string &filenameIrIn, const std::string &filenameIrOut,
const std::string &additionalPeanoOptFlags) {
const std::vector<std::string> &additionalFlags) {
std::vector<std::string> args{
// peano has no proper vectorization cost model for AIE
"-vectorize-loops=false",
Expand All @@ -98,33 +114,9 @@ FailureOr<std::vector<std::string>> makePeanoOptArgs(
"--inline-threshold=10",
// missing from libc
"--disable-builtin=memset",
// Source file, IR to optimize
"-S", filenameIrIn,
// Output file, optimized IR
"-o", filenameIrOut};

if (additionalPeanoOptFlags.empty()) return args;

// Check that additionalPeanoOptFlags is of the form "-flag1 -flag2".
// i.e. that it starts and ends with ".
if (additionalPeanoOptFlags.size() < 2 ||
additionalPeanoOptFlags.front() != '"' ||
additionalPeanoOptFlags.back() != '"') {
llvm::errs()
<< "additional peano opt flags must be of the form "
"\"-flag1 -flag2 ...\". Specifically it must start and end with \".";
return failure();
}
};

// TODO(newling) use string_view, shouldn't need to copy the string here.
std::string stripped =
additionalPeanoOptFlags.substr(1, additionalPeanoOptFlags.size() - 2);

// Split the additional flags on whitespace, and then add to the default args.
std::istringstream iss(stripped);
std::vector<std::string> additionalFlags{
std::istream_iterator<std::string>{iss},
std::istream_iterator<std::string>{}};
if (additionalFlags.empty()) return args;

// Return true if `flag` is an optimization level flag, like -O2.
auto isOptLevelFlag = [](const std::string &flag) {
Expand Down Expand Up @@ -202,10 +194,14 @@ static const std::string _CHESS_INTRINSIC_WRAPPER_CPP{
static const std::string _MM_NPU1_CC{
#include "mm_npu1.cc"
};
// This is a string that contains a mm kernel for npu4.
// This is a string that contains npu4 kernels for compilation by chess.
static const std::string _MM_NPU4_CC{
#include "mm_npu4.cc"
};
// This is a string that contains npu4 kernels for compilation by peano.
static const std::string _MM_NPU4_PEANO_CC{
#include "mm_npu4_peano.cc"
};

FailureOr<std::string> getTargetDir(const std::string &npuVersion) {
if (npuVersion == "npu1") return std::string{"target_aie_ml"};
Expand Down Expand Up @@ -509,6 +505,27 @@ LogicalResult runTool(
return success();
}

static LogicalResult assembleFileUsingPeano(
const std::string &inputFile, const std::string &outputFile,
const std::vector<std::string> &extraArgs, Path &_tempDir, Path &peanoDir,
const std::string &npuVersion, bool verbose) {
std::vector<std::string> args;
args.reserve(args.size() + std::distance(extraArgs.begin(), extraArgs.end()));
args.insert(args.end(), extraArgs.begin(), extraArgs.end());
// TODO(jornt): O0 fails with peano, so we use O1 for now.
args.emplace_back("-O1");
args.emplace_back("-c");
args.emplace_back(inputFile);
args.emplace_back("-o");
args.emplace_back(outputFile);
if (verbose) args.emplace_back("-v");
if (failed(runTool((peanoDir / "bin" / "clang").string(), args, verbose))) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, you can do this with peano? I must play around with this, see what the .ll generated looks like.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, this seems quite useful for experimentation and speed of development. However, I have already run into a couple of issues with vectorization that I think you have seen as well:

  • O0 results in compilation errors
  • Sometimes the stack size usage blows up and this results in incorrect results if it's larger than 1024 and I needed to manually increase it:
    DefaultValuedAttr<I32Attr, "0x400">:$stack_size,
    . Ideally, we should know the stack size usage before allocating a stack buffer...

Also, it should work with the high level AIE_API as well, but then you need to include those header files and I think they are only available inside Vitis. But with Vitis installed on our CI machines, that should work as well in theory, although I haven't tried it yet.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stack overflow, I suspect that's what is causing the current numerical issues I'm observing Xilinx/llvm-aie#342 -- thanks for this pointer, very well timed!

llvm::errs() << "Failed to assemble " << outputFile << ".o with peano";
return failure();
}
return success();
}

LogicalResult assembleFileUsingChess(const std::string &inputFile,
const std::string &outputFile,
const std::vector<std::string> &extraArgs,
Expand Down Expand Up @@ -560,10 +577,15 @@ static auto assembleStringUsingChess =
std::bind(assembleStringUsing, assembleFileUsingChess, _1, _2, _3, _4, _5,
_6, _7, _8, _9);

// Binds `assembleFileUsingPeano` as the first argument of
// `assembleStringUsing`; the remaining nine arguments are forwarded unchanged
// through the placeholders `_1` ... `_9`.
static auto assembleStringUsingPeano =
std::bind(assembleStringUsing, assembleFileUsingPeano, _1, _2, _3, _4, _5,
_6, _7, _8, _9);

// Generate the elf files for the core
LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
const std::string &objFile, Path &tempDir,
bool useChess, std::optional<Path> vitisDir,
bool useChess, bool useChessForUKernel,
std::optional<Path> vitisDir,
const std::string &targetArch, bool verbose,
Path peanoDir, const std::string &npuVersion,
const std::optional<std::string> &ukernel) {
Expand All @@ -578,7 +600,7 @@ LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
ukernelFileName = "mm_npu1.cc";
ukernelObjectName = "mm_npu1.o";
} else if (npuVersion == "npu4") {
ukernelFileContent = _MM_NPU4_CC;
ukernelFileContent = useChessForUKernel ? _MM_NPU4_CC : _MM_NPU4_PEANO_CC;
ukernelFileName = "mm_npu4.cc";
ukernelObjectName = "mm_npu4.o";
} else {
Expand Down Expand Up @@ -613,15 +635,30 @@ LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
return failure();
}
if (!std::filesystem::exists(cwd / ukernelObjectName)) {
mmObjectFilePath = assembleStringUsingChess(
/*inputFileStr=*/ukernelFileContent,
/*inputFileName=*/ukernelFileName,
/*outputFileName=*/ukernelObjectName,
/*outputDir=*/cwd,
/*extraArgs*/ std::vector<std::string>{},
/*workDir=*/tempDir,
/*vitisDir=*/*maybeVitisDir,
/*npuVersion*/ npuVersion, verbose);
if (useChessForUKernel) {
mmObjectFilePath = assembleStringUsingChess(
/*inputFileStr=*/ukernelFileContent,
/*inputFileName=*/ukernelFileName,
/*outputFileName=*/ukernelObjectName,
/*outputDir=*/cwd,
/*extraArgs=*/std::vector<std::string>{},
/*workDir=*/tempDir,
/*vitisDir=*/*maybeVitisDir,
/*npuVersion*/ npuVersion, verbose);
} else {
std::string targetLower = StringRef(targetArch).lower();
std::vector<std::string> extraArgs{"--target=" + targetLower +
"-none-unknown-elf"};
mmObjectFilePath = assembleStringUsingPeano(
/*inputFileStr=*/ukernelFileContent,
/*inputFileName=*/ukernelFileName,
/*outputFileName=*/ukernelObjectName,
/*outputDir=*/cwd,
/*extraArgs=*/extraArgs,
/*workDir=*/tempDir,
/*vitisDir=*/peanoDir,
/*npuVersion*/ npuVersion, verbose);
}
if (failed(mmObjectFilePath)) return failure();
} else {
mmObjectFilePath = cwd / ukernelObjectName;
Expand Down Expand Up @@ -1113,15 +1150,30 @@ LogicalResult generateUnifiedObject(

std::string OptLLVMIRFile = (tempDir / "input.opt.ll").string();

FailureOr<std::vector<std::string>> peanoArgs =
FailureOr<std::vector<std::string>> maybeAdditionalPeanoArgs =
mlir::iree_compiler::AMDAIE::detail::flagStringToVector(
additionalPeanoOptFlags);
if (failed(maybeAdditionalPeanoArgs)) {
llvm::errs() << "Failed to parse additional peano args\n";
return failure();
}

FailureOr<std::vector<std::string>> maybePeanoArgs =
mlir::iree_compiler::AMDAIE::detail::makePeanoOptArgs(
LLVMIRFile, OptLLVMIRFile, additionalPeanoOptFlags);
if (failed(peanoArgs)) {
maybeAdditionalPeanoArgs.value());
if (failed(maybePeanoArgs)) {
llvm::errs() << "Failed to make peano opt args\n";
return failure();
}

if (failed(runTool(peanoOptBin.string(), peanoArgs.value(), verbose))) {
std::vector<std::string> peanoArgs = maybePeanoArgs.value();
// Source file, IR to optimize
peanoArgs.emplace_back("-S");
peanoArgs.emplace_back(LLVMIRFile);
// Output file, optimized IR
peanoArgs.emplace_back("-o");
peanoArgs.emplace_back(OptLLVMIRFile);

if (failed(runTool(peanoOptBin.string(), peanoArgs, verbose))) {
llvm::errs() << "Failed to optimize ll with peano\n";
llvm::errs() << "Using peano at provided path: '" << peanoDir.string()
<< "'\n";
Expand Down Expand Up @@ -1216,9 +1268,10 @@ LogicalResult aie2xclbin(
const std::optional<std::string> &outputNPU, bool emitCtrlPkt,
const std::string &artifactPath, bool printIRBeforeAll,
bool printIRAfterAll, bool printIRModuleScope, bool timing,
const std::string &tempDir, bool useChess, bool verbose,
const std::optional<std::string> &vitisDir, const std::string &targetArch,
const std::string &npuVersion, const std::string &peanoDir,
const std::string &tempDir, bool useChess, bool useChessForUKernel,
bool verbose, const std::optional<std::string> &vitisDir,
const std::string &targetArch, const std::string &npuVersion,
const std::string &peanoDir,
const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal,
const std::string &xclBinKernelID, const std::string &xclBinKernelName,
const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir,
Expand Down Expand Up @@ -1248,8 +1301,9 @@ LogicalResult aie2xclbin(
}

if (failed(generateCoreElfFiles(deviceOp, unifiedObj.string(), tempDirPath,
useChess, vitisDirPath, targetArch, verbose,
peanoDir, npuVersion, ukernel))) {
useChess, useChessForUKernel, vitisDirPath,
targetArch, verbose, peanoDir, npuVersion,
ukernel))) {
llvm::errs() << "Failed to generate core ELF file(s)\n";
return failure();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ mlir::LogicalResult aie2xclbin(
const std::optional<std::string> &outputNPU, bool emitCtrlPkt,
const std::string &artifactPath, bool printIRBeforeAll,
bool printIRAfterAll, bool printIRModuleScope, bool timing,
const std::string &tempDir, bool useChess, bool verbose,
const std::optional<std::string> &vitisDir, const std::string &targetArch,
const std::string &npuVersion, const std::string &peanoDir,
const std::string &tempDir, bool useChess, bool useChessForUKernel,
bool verbose, const std::optional<std::string> &vitisDir,
const std::string &targetArch, const std::string &npuVersion,
const std::string &peanoDir,
const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal,
const std::string &xclBinKernelID, const std::string &xclBinKernelName,
const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir,
Expand All @@ -31,8 +32,9 @@ mlir::LogicalResult emitNpuInstructions(xilinx::AIE::DeviceOp deviceOp,
const std::string &outputNPU);

namespace detail {
FailureOr<std::vector<std::string>> flagStringToVector(
const std::string &flags);
FailureOr<std::vector<std::string>> makePeanoOptArgs(
const std::string &filenameIrIn, const std::string &filenameIrOut,
const std::string &additionalPeanoOptFlags);
}
const std::vector<std::string> &additionalPeanoOptFlags);
} // namespace detail
} // namespace mlir::iree_compiler::AMDAIE
Loading
Loading