Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
newling committed Oct 31, 2024
1 parent 7a84642 commit 803d182
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 206 deletions.
115 changes: 3 additions & 112 deletions compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
//===----------------------------------------------------------------------===//

#include "aievec/AIEVecOps.h"

#include "AIEVecUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"

Expand Down Expand Up @@ -602,117 +604,6 @@ ParseResult FMAElemOp::parse(OpAsmParser &parser, OperationState &result) {
return parseMulFMAElemOp(parser, result, true);
}

//===----------------------------------------------------------------------===//
// ConcatOp
//===----------------------------------------------------------------------===//

// Print out Concat op.
// Print out Concat op.
void ConcatOp::print(OpAsmPrinter &p) {
  // The printer reads the shared source type from the first source, so an
  // empty source list is malformed here.
  assert(!getSources().empty() && "concat source empty");

  // Layout: sources, optional attribute dict, then `: srcType, resultType`.
  p << " " << getSources();
  p.printOptionalAttrDict((*this)->getAttrs());
  p << " : " << getSources().getTypes().front() << ", "
    << getResult().getType();
}

// Verify Concat op.
// Verify Concat op.
//
// Checks that (1) there are at least two sources, (2) every source and the
// result are vectors, (3) all sources share one type, and (4) the result's
// lane count equals the sum of the source lane counts.
LogicalResult ConcatOp::verify() {
  // Must be concatenating at least two sources
  if (getSources().size() < 2)
    return emitError("Must concatenate at least two vectors");

  // Verify the types
  VectorType sourceType =
      llvm::dyn_cast<VectorType>(getSources().getTypes().front());
  VectorType resultType = llvm::dyn_cast<VectorType>(getResult().getType());
  if (!sourceType || !resultType)
    return emitError("requires vector type");

  // Check each source and accumulate its lane count in a single pass.
  // (Previously this took two traversals and an intermediate SmallVector
  // copy of the operand range; the copy and second loop were unnecessary.)
  unsigned totalLanes = 0;
  for (Value source : getSources()) {
    VectorType type = llvm::dyn_cast<VectorType>(source.getType());
    if (!type)
      return emitError("requires vector type");
    if (type != sourceType)
      return emitError("All sources must have same type");
    // Safe: `type` is known non-null at this point.
    totalLanes += getVectorLaneSize(type);
  }

  // The lanes in concatenated type must be the sum of lanes of source vector
  if (totalLanes != getVectorLaneSize(resultType))
    return emitError("mismatch between vector lanes "
                     "and sum of source lanes");

  return success();
}

// Parse Concat op.
// Parse Concat op.
//
// Syntax: `aievec.concat %a, %b, ... [attr-dict] : srcType, resultType`
ParseResult ConcatOp::parse(OpAsmParser &parser, OperationState &result) {
  // The comma-separated source operands come first.
  SmallVector<OpAsmParser::UnresolvedOperand, 8> sources;
  if (parser.parseOperandList(sources))
    return failure();

  // Then an optional attribute dictionary and a colon-separated type list,
  // recording where the types start for diagnostics.
  llvm::SMLoc typesLoc;
  SmallVector<Type, 2> types;
  if (parser.parseOptionalAttrDict(result.attributes) ||
      parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types))
    return failure();

  // Currently there are no attributes in concat op
  if (!result.attributes.getAttrs().empty())
    return parser.emitError(typesLoc, "expects no attribute");

  // Exactly two types are expected: the type shared by all sources, and the
  // result type.
  if (types.size() != 2)
    return parser.emitError(typesLoc, "requires two types");

  auto sourceType = llvm::dyn_cast<VectorType>(types[0]);
  auto resultType = llvm::dyn_cast<VectorType>(types[1]);
  if (!sourceType || !resultType)
    return parser.emitError(typesLoc, "requires vector type");

  // Resolve every source operand against the single shared source type.
  if (parser.resolveOperands(sources, sourceType, result.operands))
    return failure();

  return parser.addTypeToList(resultType, result.types);
}

// Infer the result type of a concat: a rank-1 vector whose length is the sum
// of the (rank-1) source lengths, with the element type of the first source.
//
// Returns failure (instead of crashing) when there are no sources or when a
// source is not a vector; the previous version indexed `srcs[0]` and used an
// unchecked `dyn_cast` result, which is UB on malformed input.
LogicalResult
ConcatOp::inferReturnTypes(MLIRContext *, std::optional<Location>,
                           ConcatOp::Adaptor adaptor,
                           SmallVectorImpl<Type> &inferredReturnTypes) {
  auto sources = adaptor.getSources();
  if (sources.empty())
    return failure();

  // int64_t, not unsigned: getDimSize returns int64_t and VectorType::get
  // takes int64_t shape entries, so no narrowing.
  int64_t totalLength = 0;
  for (Value source : sources) {
    auto type = llvm::dyn_cast<VectorType>(source.getType());
    if (!type)
      return failure();
    assert(type.getRank() == 1 &&
           "only rank 1 vectors currently supported by concat");
    totalLength += type.getDimSize(0);
  }

  auto firstType = llvm::cast<VectorType>(sources.front().getType());
  inferredReturnTypes.push_back(
      VectorType::get({totalLength}, firstType.getElementType()));
  return success();
}



//===----------------------------------------------------------------------===//
// ExtOp
Expand Down
2 changes: 2 additions & 0 deletions compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def AIEVec_ShiftOp:
have the same number of lanes and element types.
`$result = shift($lhs, $rhs, $shift)`
}];

let hasFolder = 1;
}

def AIEVec_ExtOp:
Expand Down
71 changes: 0 additions & 71 deletions compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1008,75 +1008,4 @@ void registerConvertAIEVecToLLVMPass() {
});
}

// Lowers aievec.concat to the matching xllvm concat intrinsic, bitcasting the
// i32-lane intrinsic result back to the op's result type. Only the 2- and
// 4-operand forms with (256->512), (256->1024) and (512->1024)-bit shapes are
// supported; anything else emits a warning and fails the pattern.
class ConcatOpConversion
    : public mlir::ConvertOpToLLVMPattern<aievec::ConcatOp> {
public:
  using ConvertOpToLLVMPattern<aievec::ConcatOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(aievec::ConcatOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    SmallVector<Value> sources = adaptor.getSources();

    // Total width in bits of a vector value: element width * lane count.
    auto bitSizeOf = [](VectorType vecTy) -> int {
      return vecTy.getElementType().getIntOrFloatBitWidth() *
             getVectorLaneSize(vecTy);
    };
    int srcVectorSize = bitSizeOf(cast<VectorType>(sources.front().getType()));
    int resultVectorSize =
        bitSizeOf(cast<VectorType>(op.getResult().getType()));

    if (sources.size() != 2 && sources.size() != 4) {
      op.emitWarning() << "aievec.concat with " << sources.size()
                       << " operands is not supported.\n";
      return failure();
    }

    // The intrinsics operate on vectors reinterpreted as i32 lanes.
    auto i32VecOf = [&](int64_t lanes) {
      return VectorType::get({lanes}, rewriter.getI32Type());
    };

    // create xllvm intrinsic
    Value concatOp = nullptr;
    if (srcVectorSize == 256 && resultVectorSize == 512) {
      concatOp = rewriter.create<xllvm::ConcatI512I256IntrOp>(
          loc, i32VecOf(16),
          forceCastOperandsToSignature(rewriter, loc, adaptor.getSources(),
                                       {i32VecOf(8), i32VecOf(8)}));
    } else if (srcVectorSize == 256 && resultVectorSize == 1024) {
      concatOp = rewriter.create<xllvm::ConcatI1024I256IntrOp>(
          loc, i32VecOf(32),
          forceCastOperandsToSignature(
              rewriter, loc, adaptor.getSources(),
              {i32VecOf(8), i32VecOf(8), i32VecOf(8), i32VecOf(8)}));
    } else if (srcVectorSize == 512 && resultVectorSize == 1024) {
      concatOp = rewriter.create<xllvm::ConcatI1024I512IntrOp>(
          loc, i32VecOf(32),
          forceCastOperandsToSignature(rewriter, loc, adaptor.getSources(),
                                       {i32VecOf(16), i32VecOf(16)}));
    } else {
      op.emitWarning() << "aievec.concat with " << srcVectorSize
                       << "-bit operands, and " << resultVectorSize
                       << "-bit result is not supported.\n";
      return failure();
    }

    // create bitcast for result
    rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, op.getResult().getType(),
                                                 concatOp);
    return success();
  }
};

} // namespace mlir::iree_compiler::aievec
51 changes: 33 additions & 18 deletions compiler/plugins/target/AMD-AIE/aievec/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,29 +43,44 @@ void registerCanonicalizeVectorForAIEVecPass();

/**
* This pass ensures that reads from AIE tile memory are aligned according to
* hardware constraints. For example, suppose we have 8 bytes in tile memory,
* represented as:
* hardware constraints. For example, suppose we have 128 bytes in tile memory,
* represented in hex as:
*
* 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07
* 0x00 0x01 ... 0x7E 0x7F
*
* and suppose we want to extract bytes at addresses 0x01 and 0x02 into a
* register. Suppose that the hardware requires memory-to-register transfers to
* begin at even addresses (in reality for AIE-2 is that the transfers must be
* 256-bit aligned). To work around this constraint, we use the following
* approach:
* On AIE-2, the (vector) read instructions from the tile memory into registers
* must be aligned to 256-bits (32-bytes). So if we want to read 64 bytes starting
* from 0x00 that is fine, but if we want to read 64 bytes starting from 0x01,
* then we cannot use a vector read instruction directly. To work around this
* constraint, we do the following:
*
* 1. First, perform a wider read which loads bytes from 0x00 to 0x03 into a
* larger register.
* 1. Perform a wider read, that loads 128 bytes (2x as many as we want)
* starting from 0x00 into a larger register. That is, bytes 0x00-0x7F are
* loaded, so we have 1 'junk' byte at the beginning and 63 'junk' bytes at
* the end.
*
* 2. Then, extract the target bytes 0x01 and 0x02 from this larger register
* into a smaller register in two steps at the hardware level:
* a) Use two extract instructions to split the read data (0x00-0x03) into
* two parts: 0x00-0x01 and 0x02-0x03. Reference instruction:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorconv__elem.html)
* 2. Extract the target bytes 0x01 ... 0x40 from the larger register into a
* smaller register in 2 steps, using 2 AIE specific instructions:
*
* b) Use a shift operation to combine these parts, aligning the target
* bytes (0x01-0x02) into the smaller register. Reference instruction:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html)
* a) Extract:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorconv__elem.html
*
* b) Shift:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html
*
* First, we use the extract instruction to split the read 128-bytes into two
* halves, 0x00-0x3F and 0x40-0x7F, each in its own 64-byte register. Then, we
 * use a shift operation to combine the upper 63 bytes from the first half
 * and the lower 1 byte from the second half into a new 64-byte register.
* This new register contains exactly the 64 bytes we want to read, starting
* from 0x01.
*
* If we want to read 32 bytes starting from 0x01, we can use a similar approach.
* The only consideration is that the shift operation requires 64-byte inputs,
 * so the order of the shift and extracts is reversed.
*
* We do not currently support unaligned reads of vectors which are not 32-bytes
* or 64-bytes in length.
*
* TODO(newling) use this same approach to align writes to unaligned memory.
* */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -870,8 +870,6 @@ void populateBubbleSignExtensionsLate(RewritePatternSet &patterns) {
}


// // TODO(newling) don't go via ExtractStridedSlice, as for this, offset must
// // be a constant.

// This pass converts standard vector ops into a subset of `Vector` ops more
// amenable to being converted to `AIEVec`.
Expand Down Expand Up @@ -932,12 +930,15 @@ struct CanonicalizeVectorForAIEVecPass

/// Returns either
/// 1) failure, if there is definitely an error that should be percolated up.
///
/// 2) a new transfer_read operation that is sufficiently aligned, if the old
/// transfer_read is determined to be insufficiently aligned and it is
/// possible to create a new transfer_read
/// possible to create a new transfer_read.
///
/// 3) the original transfer_read operation, otherwise.
///
/// This is currently designed for AIE2 vector constraints.
/// This is currently hard-wired for AIE2 (phoenix) HW constraints.

FailureOr<Value> getAlignedTransferRead(vector::TransferReadOp readOp,
IRRewriter &rewriter) {
//`alignBits`: transfer_reads must be aligned to this number of bits,
Expand Down Expand Up @@ -1072,7 +1073,9 @@ struct AlignTransferReadsPass
auto op = getOperation();
IRRewriter rewriter(&getContext());
op->walk([&](vector::TransferReadOp transferReadOp) {
(void)getAlignedTransferRead(transferReadOp, rewriter);
if (failed(getAlignedTransferRead(transferReadOp, rewriter))) {
signalPassFailure();
}
});
}
};
Expand Down

0 comments on commit 803d182

Please sign in to comment.