Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
newling committed Oct 31, 2024
1 parent 7a84642 commit 803d182
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 206 deletions.
115 changes: 3 additions & 112 deletions compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
//===----------------------------------------------------------------------===//

#include "aievec/AIEVecOps.h"

#include "AIEVecUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"

Expand Down Expand Up @@ -602,117 +604,6 @@ ParseResult FMAElemOp::parse(OpAsmParser &parser, OperationState &result) {
return parseMulFMAElemOp(parser, result, true);
}

//===----------------------------------------------------------------------===//
// ConcatOp
//===----------------------------------------------------------------------===//

// Print out Concat op.
// Print out Concat op.
void ConcatOp::print(OpAsmPrinter &p) {
  // The printer reads the shared source type from the first source, so an
  // empty source list is malformed here.
  assert(!getSources().empty() && "concat source empty");

  // Layout: sources, optional attribute dict, then `: srcType, resultType`.
  p << " " << getSources();
  p.printOptionalAttrDict((*this)->getAttrs());
  p << " : " << getSources().getTypes().front() << ", "
    << getResult().getType();
}

// Verify Concat op.
// Verify Concat op.
//
// Checks that (1) there are at least two sources, (2) every source and the
// result are vectors, (3) all sources share one type, and (4) the result's
// lane count equals the sum of the source lane counts.
LogicalResult ConcatOp::verify() {
  // Must be concatenating at least two sources
  if (getSources().size() < 2)
    return emitError("Must concatenate at least two vectors");

  // Verify the types
  VectorType sourceType =
      llvm::dyn_cast<VectorType>(getSources().getTypes().front());
  VectorType resultType = llvm::dyn_cast<VectorType>(getResult().getType());
  if (!sourceType || !resultType)
    return emitError("requires vector type");

  // Check each source and accumulate its lane count in a single pass.
  // (Previously this took two traversals and an intermediate SmallVector
  // copy of the operand range; the copy and second loop were unnecessary.)
  unsigned totalLanes = 0;
  for (Value source : getSources()) {
    VectorType type = llvm::dyn_cast<VectorType>(source.getType());
    if (!type)
      return emitError("requires vector type");
    if (type != sourceType)
      return emitError("All sources must have same type");
    // Safe: `type` is known non-null at this point.
    totalLanes += getVectorLaneSize(type);
  }

  // The lanes in concatenated type must be the sum of lanes of source vector
  if (totalLanes != getVectorLaneSize(resultType))
    return emitError("mismatch between vector lanes "
                     "and sum of source lanes");

  return success();
}

// Parse Concat op.
// Parse Concat op.
//
// Syntax: `aievec.concat %a, %b, ... [attr-dict] : srcType, resultType`
ParseResult ConcatOp::parse(OpAsmParser &parser, OperationState &result) {
  // The comma-separated source operands come first.
  SmallVector<OpAsmParser::UnresolvedOperand, 8> sources;
  if (parser.parseOperandList(sources))
    return failure();

  // Then an optional attribute dictionary and a colon-separated type list,
  // recording where the types start for diagnostics.
  llvm::SMLoc typesLoc;
  SmallVector<Type, 2> types;
  if (parser.parseOptionalAttrDict(result.attributes) ||
      parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types))
    return failure();

  // Currently there are no attributes in concat op
  if (!result.attributes.getAttrs().empty())
    return parser.emitError(typesLoc, "expects no attribute");

  // Exactly two types are expected: the type shared by all sources, and the
  // result type.
  if (types.size() != 2)
    return parser.emitError(typesLoc, "requires two types");

  auto sourceType = llvm::dyn_cast<VectorType>(types[0]);
  auto resultType = llvm::dyn_cast<VectorType>(types[1]);
  if (!sourceType || !resultType)
    return parser.emitError(typesLoc, "requires vector type");

  // Resolve every source operand against the single shared source type.
  if (parser.resolveOperands(sources, sourceType, result.operands))
    return failure();

  return parser.addTypeToList(resultType, result.types);
}

// Infer the result type of a concat: a rank-1 vector whose length is the sum
// of the (rank-1) source lengths, with the element type of the first source.
//
// Returns failure (instead of crashing) when there are no sources or when a
// source is not a vector; the previous version indexed `srcs[0]` and used an
// unchecked `dyn_cast` result, which is UB on malformed input.
LogicalResult
ConcatOp::inferReturnTypes(MLIRContext *, std::optional<Location>,
                           ConcatOp::Adaptor adaptor,
                           SmallVectorImpl<Type> &inferredReturnTypes) {
  auto sources = adaptor.getSources();
  if (sources.empty())
    return failure();

  // int64_t, not unsigned: getDimSize returns int64_t and VectorType::get
  // takes int64_t shape entries, so no narrowing.
  int64_t totalLength = 0;
  for (Value source : sources) {
    auto type = llvm::dyn_cast<VectorType>(source.getType());
    if (!type)
      return failure();
    assert(type.getRank() == 1 &&
           "only rank 1 vectors currently supported by concat");
    totalLength += type.getDimSize(0);
  }

  auto firstType = llvm::cast<VectorType>(sources.front().getType());
  inferredReturnTypes.push_back(
      VectorType::get({totalLength}, firstType.getElementType()));
  return success();
}



//===----------------------------------------------------------------------===//
// ExtOp
Expand Down
2 changes: 2 additions & 0 deletions compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def AIEVec_ShiftOp:
have the same number of lanes and element types.
`$result = shift($lhs, $rhs, $shift)`
}];

let hasFolder = 1;
}

def AIEVec_ExtOp:
Expand Down
71 changes: 0 additions & 71 deletions compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1008,75 +1008,4 @@ void registerConvertAIEVecToLLVMPass() {
});
}

// Lowers aievec.concat to the matching xllvm concat intrinsic, bitcasting the
// i32-lane intrinsic result back to the op's result type. Only the 2- and
// 4-operand forms with (256->512), (256->1024) and (512->1024)-bit shapes are
// supported; anything else emits a warning and fails the pattern.
class ConcatOpConversion
    : public mlir::ConvertOpToLLVMPattern<aievec::ConcatOp> {
public:
  using ConvertOpToLLVMPattern<aievec::ConcatOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(aievec::ConcatOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    SmallVector<Value> sources = adaptor.getSources();

    // Total width in bits of a vector value: element width * lane count.
    auto bitSizeOf = [](VectorType vecTy) -> int {
      return vecTy.getElementType().getIntOrFloatBitWidth() *
             getVectorLaneSize(vecTy);
    };
    int srcVectorSize = bitSizeOf(cast<VectorType>(sources.front().getType()));
    int resultVectorSize =
        bitSizeOf(cast<VectorType>(op.getResult().getType()));

    if (sources.size() != 2 && sources.size() != 4) {
      op.emitWarning() << "aievec.concat with " << sources.size()
                       << " operands is not supported.\n";
      return failure();
    }

    // The intrinsics operate on vectors reinterpreted as i32 lanes.
    auto i32VecOf = [&](int64_t lanes) {
      return VectorType::get({lanes}, rewriter.getI32Type());
    };

    // create xllvm intrinsic
    Value concatOp = nullptr;
    if (srcVectorSize == 256 && resultVectorSize == 512) {
      concatOp = rewriter.create<xllvm::ConcatI512I256IntrOp>(
          loc, i32VecOf(16),
          forceCastOperandsToSignature(rewriter, loc, adaptor.getSources(),
                                       {i32VecOf(8), i32VecOf(8)}));
    } else if (srcVectorSize == 256 && resultVectorSize == 1024) {
      concatOp = rewriter.create<xllvm::ConcatI1024I256IntrOp>(
          loc, i32VecOf(32),
          forceCastOperandsToSignature(
              rewriter, loc, adaptor.getSources(),
              {i32VecOf(8), i32VecOf(8), i32VecOf(8), i32VecOf(8)}));
    } else if (srcVectorSize == 512 && resultVectorSize == 1024) {
      concatOp = rewriter.create<xllvm::ConcatI1024I512IntrOp>(
          loc, i32VecOf(32),
          forceCastOperandsToSignature(rewriter, loc, adaptor.getSources(),
                                       {i32VecOf(16), i32VecOf(16)}));
    } else {
      op.emitWarning() << "aievec.concat with " << srcVectorSize
                       << "-bit operands, and " << resultVectorSize
                       << "-bit result is not supported.\n";
      return failure();
    }

    // create bitcast for result
    rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, op.getResult().getType(),
                                                 concatOp);
    return success();
  }
};

} // namespace mlir::iree_compiler::aievec
51 changes: 33 additions & 18 deletions compiler/plugins/target/AMD-AIE/aievec/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,29 +43,44 @@ void registerCanonicalizeVectorForAIEVecPass();

/**
* This pass ensures that reads from AIE tile memory are aligned according to
* hardware constraints. For example, suppose we have 8 bytes in tile memory,
* represented as:
* hardware constraints. For example, suppose we have 128 bytes in tile memory,
* represented in hex as:
*
* 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07
* 0x00 0x01 ... 0x7E 0x7F
*
* and suppose we want to extract bytes at addresses 0x01 and 0x02 into a
* register. Suppose that the hardware requires memory-to-register transfers to
* begin at even addresses (in reality for AIE-2 is that the transfers must be
* 256-bit aligned). To work around this constraint, we use the following
* approach:
* On AIE-2, the (vector) read instructions from the tile memory into registers
* must be aligned to 256-bits (32-bytes). So if we want to read 64 bytes starting
* from 0x00 that is fine, but if we want to read 64 bytes starting from 0x01,
* then we cannot use a vector read instruction directly. To work around this
* constraint, we do the following:
*
* 1. First, perform a wider read which loads bytes from 0x00 to 0x03 into a
* larger register.
* 1. Perform a wider read, that loads 128 bytes (2x as many as we want)
* starting from 0x00 into a larger register. That is, bytes 0x00-0x7F are
* loaded, so we have 1 'junk' byte at the beginning and 63 'junk' bytes at
* the end.
*
* 2. Then, extract the target bytes 0x01 and 0x02 from this larger register
* into a smaller register in two steps at the hardware level:
* a) Use two extract instructions to split the read data (0x00-0x03) into
* two parts: 0x00-0x01 and 0x02-0x03. Reference instruction:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorconv__elem.html)
* 2. Extract the target bytes 0x01 ... 0x40 from the larger register into a
* smaller register in 2 steps, using 2 AIE specific instructions:
*
* b) Use a shift operation to combine these parts, aligning the target
* bytes (0x01-0x02) into the smaller register. Reference instruction:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html)
* a) Extract:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorconv__elem.html
*
* b) Shift:
* https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html
*
* First, we use the extract instruction to split the read 128-bytes into two
* halves, 0x00-0x3F and 0x40-0x7F, each in its own 64-byte register. Then, we
 * use a shift operation to combine the upper 63 bytes from the first half
 * and the lower 1 byte from the second half into a new 64-byte register.
* This new register contains exactly the 64 bytes we want to read, starting
* from 0x01.
*
* If we want to read 32 bytes starting from 0x01, we can use a similar approach.
* The only consideration is that the shift operation requires 64-byte inputs,
 * so the order of the shift and extracts is reversed.
*
* We do not currently support unaligned reads of vectors which are not 32-bytes
* or 64-bytes in length.
*
* TODO(newling) use this same approach to align writes to unaligned memory.
* */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -870,8 +870,6 @@ void populateBubbleSignExtensionsLate(RewritePatternSet &patterns) {
}


// // TODO(newling) don't go via ExtractStridedSlice, as for this, offset must
// // be a constant.

// This pass converts standard vector ops into a subset of `Vector` ops more
// amenable to being converted to `AIEVec`.
Expand Down Expand Up @@ -932,12 +930,15 @@ struct CanonicalizeVectorForAIEVecPass

/// Returns either
/// 1) failure, if there is definitely an error that should be percolated up.
///
/// 2) a new transfer_read operation that is sufficiently aligned, if the old
/// transfer_read is determined to be insufficiently aligned and it is
/// possible to create a new transfer_read
/// possible to create a new transfer_read.
///
/// 3) the original transfer_read operation, otherwise.
///
/// This is currently designed for AIE2 vector constraints.
/// This is currently hard-wired for AIE2 (phoenix) HW constraints.

FailureOr<Value> getAlignedTransferRead(vector::TransferReadOp readOp,
IRRewriter &rewriter) {
//`alignBits`: transfer_reads must be aligned to this number of bits,
Expand Down Expand Up @@ -1072,7 +1073,9 @@ struct AlignTransferReadsPass
auto op = getOperation();
IRRewriter rewriter(&getContext());
op->walk([&](vector::TransferReadOp transferReadOp) {
(void)getAlignedTransferRead(transferReadOp, rewriter);
if (failed(getAlignedTransferRead(transferReadOp, rewriter))) {
signalPassFailure();
}
});
}
};
Expand Down

0 comments on commit 803d182

Please sign in to comment.