Commit a34cf73

Merge branch 'main' into aievec_copy_from_mlir_aie

newling authored Oct 31, 2024
2 parents 88f50eb + 38118dc

Showing 20 changed files with 360 additions and 269 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-linux.yml
@@ -57,7 +57,7 @@ jobs:
- name: Python deps
run: |
pip install -r third_party/iree/runtime/bindings/python/iree/runtime/build_requirements.txt
- pip install pyyaml
+ pip install pyyaml pybind11==2.13.6 nanobind==2.2.0
- name: Enable cache
uses: actions/cache/restore@v3
2 changes: 1 addition & 1 deletion .github/workflows/ci-macos.yml
@@ -78,7 +78,7 @@ jobs:
- name: Python deps
run: |
pip install -r third_party/iree/runtime/bindings/python/iree/runtime/build_requirements.txt
- pip install pytest
+ pip install pytest pybind11==2.13.6 nanobind==2.2.0
- name: Enable cache
uses: actions/cache/restore@v3
2 changes: 1 addition & 1 deletion .github/workflows/ci-windows.yml
@@ -81,7 +81,7 @@ jobs:
- name: Python deps
run: |
pip install -r third_party\iree\runtime\bindings\python\iree\runtime\build_requirements.txt
- pip install pyyaml
+ pip install pyyaml pybind11==2.13.6 nanobind==2.2.0
- name: Enable cache
uses: actions/cache/restore@v3
16 changes: 16 additions & 0 deletions README.md
@@ -37,6 +37,22 @@ git \
The above avoids cloning the entire history of each submodule, and skips a few currently unused
submodules that are nested in IREE.

### Dependencies

#### For Linux

Build and install `xdna-driver` at commit `59f1d62`:

```
git clone git@github.com:amd/xdna-driver.git
cd xdna-driver
git checkout 59f1d62
# get the code for the submodules
git submodule update --init --recursive
```

Then follow the instructions at https://github.com/amd/xdna-driver to build and install the driver module.
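
After installing, a quick sanity check is to confirm the kernel module is available and loaded (a minimal sketch, assuming the module is named `amdxdna` as in the upstream repository):

```
# Check that the XDNA kernel module is known to the kernel and loaded:
modinfo amdxdna
lsmod | grep amdxdna
```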

## Building (along with IREE)

### Just show me the CMake
Expand Down
4 changes: 2 additions & 2 deletions build_tools/ci/cpu_comparison/run.py
@@ -537,8 +537,8 @@ def aie_vs_llvm_cpu(
config,
test_file,
use_ukernel=False,
tile_pipeline="pad-pack",
lower_to_aie_pipeline="air",
tile_pipeline="pack-peel",
lower_to_aie_pipeline="objectFifo",
function_name=None,
seed=1,
rtol=1e-6,
21 changes: 11 additions & 10 deletions build_tools/ci/run_matmul_test.sh
@@ -555,16 +555,17 @@ run_matmul_test \
# MLIR-AIR Matmul tests
###################################################################

if [ -d "$VITIS" ]; then
run_matmul_test \
--name_prefix "ukern" \
--lower_to_aie_pipeline "air" \
--tile_pipeline "pad-pack" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "256" --k "256" --n "256" \
--use_ukernel "1"
fi
# TODO: re-enable after fixing in AIR
# if [ -d "$VITIS" ]; then
# run_matmul_test \
# --name_prefix "ukern" \
# --lower_to_aie_pipeline "air" \
# --tile_pipeline "pad-pack" \
# --lhs_rhs_type "bf16" \
# --acc_type "f32" \
# --m "256" --k "256" --n "256" \
# --use_ukernel "1"
# fi

# Example of a run with a group of 2+ matmuls. Currently this test is passed
# the flag '--num_repeat_runs 0' as there is an issue with the runtime if
271 changes: 269 additions & 2 deletions compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
@@ -11,6 +11,7 @@
// it compatible with the available vector instructions in AIE architectures
//===----------------------------------------------------------------------===//

#include <limits>
#include <memory>

#include "Passes.h"
@@ -37,7 +38,268 @@

namespace mlir::iree_compiler::aievec {

- using namespace mlir;
namespace copied_from_mlir {

/// This code is copied from MLIR. It makes a single-line change in two
/// patterns, FlattenContiguousRowMajorTransfer[Read|Write]Pattern. The change
/// allows the memrefs of the transfer operations to be flattened to 1-D
/// memrefs, i.e. the memrefs are flattened in addition to the vectors.
/// TODO(newling): consider upstreaming this to reduce code duplication.
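///
/// Illustrative example (hypothetical shapes): for
///   %v = vector.transfer_read %m[%c0, %c0, %c0] ... :
///     memref<2x4x8xi8>, vector<4x8xi8>
/// the upstream patterns collapse only the dimensions covered by the vector,
/// yielding a read of vector<32xi8> from memref<2x32xi8>, whereas with
/// `firstDimToCollapse = 0` (as in this copy) the memref is collapsed all the
/// way to memref<64xi8> and the leading index is folded into the linearized
/// offset.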

/// Creates a memref.collapse_shape collapsing all inner dimensions of the
/// input starting at `firstDimToCollapse`.
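/// For example (hypothetical shapes): a memref<4x3x2xf32> with
/// `firstDimToCollapse = 1` is collapsed with reassociation [[0], [1, 2]] to
/// memref<4x6xf32>, while `firstDimToCollapse = 0` gives reassociation
/// [[0, 1, 2]] and memref<24xf32>.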
static Value collapseInnerDims(PatternRewriter &rewriter, mlir::Location loc,
Value input, int64_t firstDimToCollapse) {
ShapedType inputType = cast<ShapedType>(input.getType());
if (inputType.getRank() == 1) return input;
SmallVector<ReassociationIndices> reassociation;
for (int64_t i = 0; i < firstDimToCollapse; ++i)
reassociation.push_back(ReassociationIndices{i});
ReassociationIndices collapsedIndices;
for (int64_t i = firstDimToCollapse; i < inputType.getRank(); ++i)
collapsedIndices.push_back(i);
reassociation.push_back(collapsedIndices);
return rewriter.create<memref::CollapseShapeOp>(loc, input, reassociation);
}

/// Returns the new indices that collapses the inner dimensions starting from
/// the `firstDimToCollapse` dimension.
static SmallVector<Value> getCollapsedIndices(RewriterBase &rewriter,
Location loc,
ArrayRef<int64_t> shape,
ValueRange indices,
int64_t firstDimToCollapse) {
assert(firstDimToCollapse < static_cast<int64_t>(indices.size()));

// If all the collapsed indices are zero then no extra logic is needed.
// Otherwise, a new offset/index has to be computed.
SmallVector<Value> indicesAfterCollapsing(
indices.begin(), indices.begin() + firstDimToCollapse);
SmallVector<Value> indicesToCollapse(indices.begin() + firstDimToCollapse,
indices.end());
if (llvm::all_of(indicesToCollapse, isZeroIndex)) {
indicesAfterCollapsing.push_back(indicesToCollapse[0]);
return indicesAfterCollapsing;
}

// Compute the remaining trailing index/offset required for reading from
// the collapsed memref:
//
//    offset = 0
//    for (i = firstDimToCollapse; i < sourceRank; ++i)
//      offset += stride(i) * transferReadOp.indices[i]
//
// where stride(i) is the product of the sizes of dimensions i+1, ..., rank-1
// (a suffix product). For this example:
//    %2 = vector.transfer_read/write %arg4[%c0, %arg0, %c0] (...) :
//      memref<1x43x2xi32>, vector<1x2xi32>
// which, with firstDimToCollapse = 1, would be collapsed to:
//    %1 = vector.transfer_read/write %collapse_shape[%c0, %offset] (...) :
//      memref<1x86xi32>, vector<2xi32>
// one would get the following offset:
//    %offset = %arg0 * 2
OpFoldResult collapsedOffset =
rewriter.create<arith::ConstantIndexOp>(loc, 0).getResult();

auto collapsedStrides = computeSuffixProduct(
ArrayRef<int64_t>(shape.begin() + firstDimToCollapse, shape.end()));

// Compute the collapsed offset.
auto &&[collapsedExpr, collapsedVals] =
computeLinearIndex(collapsedOffset, collapsedStrides, indicesToCollapse);
collapsedOffset = affine::makeComposedFoldedAffineApply(
rewriter, loc, collapsedExpr, collapsedVals);

if (collapsedOffset.is<Value>()) {
indicesAfterCollapsing.push_back(collapsedOffset.get<Value>());
} else {
indicesAfterCollapsing.push_back(rewriter.create<arith::ConstantIndexOp>(
loc, *getConstantIntValue(collapsedOffset)));
}

return indicesAfterCollapsing;
}

/// Rewrites contiguous row-major vector.transfer_read ops by inserting
/// memref.collapse_shape on the source so that the resulting
/// vector.transfer_read has a 1D source. Requires the source shape to be
/// already reduced, i.e. without unit dims.
///
/// If `targetVectorBitwidth` is provided, the flattening will only happen if
/// the bitwidth of the trailing dimension of the read vector is smaller than
/// the provided bitwidth.
class FlattenContiguousRowMajorTransferReadPattern
: public OpRewritePattern<vector::TransferReadOp> {
public:
FlattenContiguousRowMajorTransferReadPattern(MLIRContext *context,
unsigned vectorBitwidth,
PatternBenefit benefit)
: OpRewritePattern<vector::TransferReadOp>(context, benefit),
targetVectorBitwidth(vectorBitwidth) {}

LogicalResult matchAndRewrite(vector::TransferReadOp transferReadOp,
PatternRewriter &rewriter) const override {
auto loc = transferReadOp.getLoc();
Value vector = transferReadOp.getVector();
VectorType vectorType = cast<VectorType>(vector.getType());
auto source = transferReadOp.getSource();
MemRefType sourceType = dyn_cast<MemRefType>(source.getType());

// 0. Check pre-conditions
// The contiguity check below only applies to memrefs; bail out on tensors.
if (!sourceType) return failure();
// If this is already 0D/1D, there's nothing to do.
if (vectorType.getRank() <= 1) return failure();
if (!vectorType.getElementType().isSignlessIntOrFloat()) return failure();
unsigned trailingVectorDimBitwidth =
vectorType.getShape().back() * vectorType.getElementTypeBitWidth();
if (trailingVectorDimBitwidth >= targetVectorBitwidth) return failure();
if (!vector::isContiguousSlice(sourceType, vectorType)) return failure();
// TODO: generalize this pattern, relax the requirements here.
if (transferReadOp.hasOutOfBoundsDim()) return failure();
if (!transferReadOp.getPermutationMap().isMinorIdentity()) return failure();
if (transferReadOp.getMask()) return failure();

// TODO(newling) This is the one line which changes from the original
// MLIR function. Upstream this as an option to flatten the memref (and
// not just the vector).
// int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank();
int64_t firstDimToCollapse = 0;

// 1. Collapse the source memref
Value collapsedSource =
collapseInnerDims(rewriter, loc, source, firstDimToCollapse);
MemRefType collapsedSourceType =
cast<MemRefType>(collapsedSource.getType());
int64_t collapsedRank = collapsedSourceType.getRank();
assert(collapsedRank == firstDimToCollapse + 1);

// 2. Generate input args for a new vector.transfer_read that will read
// from the collapsed memref.
// 2.1. New dim exprs + affine map
SmallVector<AffineExpr, 1> dimExprs{
getAffineDimExpr(firstDimToCollapse, rewriter.getContext())};
auto collapsedMap =
AffineMap::get(collapsedRank, 0, dimExprs, rewriter.getContext());

// 2.2 New indices
SmallVector<Value> collapsedIndices =
getCollapsedIndices(rewriter, loc, sourceType.getShape(),
transferReadOp.getIndices(), firstDimToCollapse);

// 3. Create new vector.transfer_read that reads from the collapsed memref
VectorType flatVectorType = VectorType::get({vectorType.getNumElements()},
vectorType.getElementType());
vector::TransferReadOp flatRead = rewriter.create<vector::TransferReadOp>(
loc, flatVectorType, collapsedSource, collapsedIndices, collapsedMap);
flatRead.setInBoundsAttr(rewriter.getBoolArrayAttr({true}));

// 4. Replace the old transfer_read with the new one reading from the
// collapsed shape
rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(
transferReadOp, cast<VectorType>(vector.getType()), flatRead);
return success();
}

private:
// Minimum bitwidth that the trailing vector dimension should have after
// flattening.
unsigned targetVectorBitwidth;
};

/// Rewrites contiguous row-major vector.transfer_write ops by inserting
/// memref.collapse_shape on the source so that the resulting
/// vector.transfer_write has a 1D source. Requires the source shape to be
/// already reduced, i.e. without unit dims.
///
/// If `targetVectorBitwidth` is provided, the flattening will only happen if
/// the bitwidth of the trailing dimension of the written vector is smaller
/// than the provided bitwidth.
class FlattenContiguousRowMajorTransferWritePattern
: public OpRewritePattern<vector::TransferWriteOp> {
public:
FlattenContiguousRowMajorTransferWritePattern(MLIRContext *context,
unsigned vectorBitwidth,
PatternBenefit benefit)
: OpRewritePattern<vector::TransferWriteOp>(context, benefit),
targetVectorBitwidth(vectorBitwidth) {}

LogicalResult matchAndRewrite(vector::TransferWriteOp transferWriteOp,
PatternRewriter &rewriter) const override {
auto loc = transferWriteOp.getLoc();
Value vector = transferWriteOp.getVector();
VectorType vectorType = cast<VectorType>(vector.getType());
Value source = transferWriteOp.getSource();
MemRefType sourceType = dyn_cast<MemRefType>(source.getType());

// 0. Check pre-conditions
// The contiguity check below only applies to memrefs; bail out on tensors.
if (!sourceType) return failure();
// If this is already 0D/1D, there's nothing to do.
if (vectorType.getRank() <= 1) return failure();
if (!vectorType.getElementType().isSignlessIntOrFloat()) return failure();
unsigned trailingVectorDimBitwidth =
vectorType.getShape().back() * vectorType.getElementTypeBitWidth();
if (trailingVectorDimBitwidth >= targetVectorBitwidth) return failure();
if (!vector::isContiguousSlice(sourceType, vectorType)) return failure();
// TODO: generalize this pattern, relax the requirements here.
if (transferWriteOp.hasOutOfBoundsDim()) return failure();
if (!transferWriteOp.getPermutationMap().isMinorIdentity())
return failure();
if (transferWriteOp.getMask()) return failure();

// TODO(newling) This is the one line which changes from the original
// MLIR function. Upstream this as an option to flatten the memref (and
// not just the vector).
// int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank();
int64_t firstDimToCollapse = 0;

// 1. Collapse the source memref
Value collapsedSource =
collapseInnerDims(rewriter, loc, source, firstDimToCollapse);
MemRefType collapsedSourceType =
cast<MemRefType>(collapsedSource.getType());
int64_t collapsedRank = collapsedSourceType.getRank();
assert(collapsedRank == firstDimToCollapse + 1);

// 2. Generate input args for a new vector.transfer_write that will write
// to the collapsed memref.
// 2.1. New dim exprs + affine map
SmallVector<AffineExpr, 1> dimExprs{
getAffineDimExpr(firstDimToCollapse, rewriter.getContext())};
auto collapsedMap =
AffineMap::get(collapsedRank, 0, dimExprs, rewriter.getContext());

// 2.2 New indices
SmallVector<Value> collapsedIndices =
getCollapsedIndices(rewriter, loc, sourceType.getShape(),
transferWriteOp.getIndices(), firstDimToCollapse);

// 3. Create new vector.transfer_write that writes to the collapsed memref
VectorType flatVectorType = VectorType::get({vectorType.getNumElements()},
vectorType.getElementType());
Value flatVector =
rewriter.create<vector::ShapeCastOp>(loc, flatVectorType, vector);
vector::TransferWriteOp flatWrite =
rewriter.create<vector::TransferWriteOp>(
loc, flatVector, collapsedSource, collapsedIndices, collapsedMap);
flatWrite.setInBoundsAttr(rewriter.getBoolArrayAttr({true}));

// 4. Replace the old transfer_write with the new one writing the
// collapsed shape
rewriter.eraseOp(transferWriteOp);
return success();
}

private:
// Minimum bitwidth that the trailing vector dimension should have after
// flattening.
unsigned targetVectorBitwidth;
};

} // namespace copied_from_mlir


static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
if (op.getKind() != vector::CombiningKind::ADD) return false;
@@ -643,7 +905,12 @@ struct CanonicalizeVectorForAIEVecPass
ToMinorIdentityTransferWritePattern,
ConvertLeadingUnitDimInsertToReshapePattern>(context);
patterns.add<ConvertSplatTransferReadToBroadcastPattern>(context);
- mlir::vector::populateFlattenVectorTransferPatterns(patterns);
patterns
.add<copied_from_mlir::FlattenContiguousRowMajorTransferReadPattern,
copied_from_mlir::FlattenContiguousRowMajorTransferWritePattern>(
context, std::numeric_limits<unsigned>::max(), 1);
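// The vector.shape_cast ops introduced by the flattening patterns above are
// folded away, and remaining unit dims are dropped via shape_cast rewrites,
// by the next two pattern sets.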
mlir::vector::populateShapeCastFoldingPatterns(patterns);
mlir::vector::populateDropUnitDimWithShapeCastPatterns(patterns);
mlir::vector::populateVectorBroadcastLoweringPatterns(patterns);
(void)applyPatternsAndFoldGreedily(op, std::move(patterns));
}
(The remaining changed files are not shown.)
