diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
new file mode 100644
index 000000000..6ba9acaec
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
@@ -0,0 +1,50 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-combine-logical-objectfifos-for-connection-reuse"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+class AMDAIECombineLogicalObjFifosForConnectionReusePass
+    : public impl::AMDAIECombineLogicalObjFifosForConnectionReuseBase<
+          AMDAIECombineLogicalObjFifosForConnectionReusePass> {
+ public:
+  using AMDAIECombineLogicalObjFifosForConnectionReuseBase::
+      AMDAIECombineLogicalObjFifosForConnectionReuseBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIECombineLogicalObjFifosForConnectionReusePass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
+      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
+
+  if (failed(combineLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass>
+createAMDAIECombineLogicalObjFifosForConnectionReusePass() {
+  return std::make_unique<AMDAIECombineLogicalObjFifosForConnectionReusePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
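Note: the new pass is deliberately thin; it only gathers candidate L2->L1 DMA ops and hands them to combineLogicalObjectFifos from the splitting utilities. A minimal sketch of driving just this pass over a module, assuming the standard MLIR PassManager API (the context/moduleOp variables are illustrative, not part of the patch):

    mlir::PassManager pm(&context);
    pm.addPass(mlir::iree_compiler::AMDAIE::
                   createAMDAIECombineLogicalObjFifosForConnectionReusePass());
    if (failed(pm.run(moduleOp))) {
      // Combining bailed out; run with
      // -debug-only=iree-amdaie-logicalobjfifo-splitting-utils for the reason.
    }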
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
index 8a0e65e1d..0a77f17dc 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -52,60 +52,6 @@ int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound,
 
 namespace {
 
-/// Utility affine expression visitor to retrieve the scale and optional bias
-/// from the expression.
-struct RetrieveScaleAndBias
-    : public AffineExprVisitor<RetrieveScaleAndBias, LogicalResult> {
-  std::optional<int64_t> scale;
-  std::optional<int64_t> bias;
-  LogicalResult visitAffineBinaryOpExpr(AffineBinaryOpExpr /*expr*/) {
-    return failure();
-  }
-  LogicalResult visitConstantExpr(AffineConstantExpr /*expr*/) {
-    return failure();
-  }
-  LogicalResult visitDimExpr(AffineDimExpr /*expr*/) { return failure(); }
-  LogicalResult visitSymbolExpr(AffineSymbolExpr /*expr*/) { return failure(); }
-  LogicalResult visitMulExpr(AffineBinaryOpExpr expr) {
-    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
-        isa<AffineDimExpr>(expr.getLHS())) {
-      scale = rhsSize.getValue();
-    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
-               isa<AffineDimExpr>(expr.getRHS())) {
-      scale = lhsSize.getValue();
-    }
-    return success();
-  }
-  LogicalResult visitAddExpr(AffineBinaryOpExpr expr) {
-    if (bias) return failure();
-    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS())) {
-      bias = rhsSize.getValue();
-      if (bias.value() < 0) return failure();
-      if (isa<AffineBinaryOpExpr>(expr.getLHS())) {
-        return visit(expr.getLHS());
-      } else if (isa<AffineDimExpr>(expr.getLHS())) {
-        scale = 1;
-        return success();
-      } else {
-        return failure();
-      }
-    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS())) {
-      bias = lhsSize.getValue();
-      if (bias.value() < 0) return failure();
-      if (isa<AffineBinaryOpExpr>(expr.getRHS())) {
-        return visit(expr.getRHS());
-      } else if (isa<AffineDimExpr>(expr.getRHS())) {
-        scale = 1;
-        return success();
-      } else {
-        return failure();
-      }
-    } else {
-      return failure();
-    }
-  }
-};
-
 struct SubsumeLoopIntoDMA
     : public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
   using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
index e628cc739..81a0d6994 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
@@ -12,12 +12,71 @@
 #include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
 #include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 
 namespace mlir::iree_compiler::AMDAIE {
 
+/// Utility to retrieve a constant index from an OpFoldResult.
+int64_t getConstantIndexOrAssert(OpFoldResult dim);
+
+/// Utility affine expression visitor to retrieve the scale and optional bias
+/// from the expression.
+struct RetrieveScaleAndBias
+    : public AffineExprVisitor<RetrieveScaleAndBias, LogicalResult> {
+  std::optional<int64_t> scale;
+  std::optional<int64_t> bias;
+  LogicalResult visitAffineBinaryOpExpr(AffineBinaryOpExpr /*expr*/) {
+    return failure();
+  }
+  LogicalResult visitConstantExpr(AffineConstantExpr /*expr*/) {
+    return failure();
+  }
+  LogicalResult visitDimExpr(AffineDimExpr /*expr*/) { return failure(); }
+  LogicalResult visitSymbolExpr(AffineSymbolExpr /*expr*/) { return failure(); }
+  LogicalResult visitMulExpr(AffineBinaryOpExpr expr) {
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
+        isa<AffineDimExpr>(expr.getLHS())) {
+      scale = rhsSize.getValue();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
+               isa<AffineDimExpr>(expr.getRHS())) {
+      scale = lhsSize.getValue();
+    }
+    return success();
+  }
+  LogicalResult visitAddExpr(AffineBinaryOpExpr expr) {
+    if (bias) return failure();
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS())) {
+      bias = rhsSize.getValue();
+      if (bias.value() < 0) return failure();
+      if (isa<AffineBinaryOpExpr>(expr.getLHS())) {
+        return visit(expr.getLHS());
+      } else if (isa<AffineDimExpr>(expr.getLHS())) {
+        scale = 1;
+        return success();
+      } else {
+        return failure();
+      }
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS())) {
+      bias = lhsSize.getValue();
+      if (bias.value() < 0) return failure();
+      if (isa<AffineBinaryOpExpr>(expr.getRHS())) {
+        return visit(expr.getRHS());
+      } else if (isa<AffineDimExpr>(expr.getRHS())) {
+        scale = 1;
+        return success();
+      } else {
+        return failure();
+      }
+    } else {
+      return failure();
+    }
+  }
+};
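Note: moving RetrieveScaleAndBias into AMDAIEDmaUtils.h makes it reusable by the new combine utilities below. As a worked example of what it computes, consider the expression from the lit test's affine_map<(d0) -> (d0 * 64 + 32)> (a sketch; the ctx variable is illustrative):

    MLIRContext *ctx = ...;
    AffineExpr expr = getAffineDimExpr(0, ctx) * 64 + 32;  // d0 * 64 + 32
    RetrieveScaleAndBias retriever;
    if (succeeded(retriever.visit(expr))) {
      // retriever.scale == 64, retriever.bias == 32.
    }
    // A bare dim/symbol/constant, or a negative bias, makes visit() fail.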
+
 // Constant specifying the number of inter-iteration dimension for DMA
 // operations.
 //
@@ -194,9 +253,9 @@ struct DmaDimConfig {
   AMDAIE::AMDAIETileType sourceTileType;
   AMDAIE::AMDAIETileType targetTileType;
   /// The maximum number of addressing dimensions on the source side of the DMA.
-  uint8_t sourceMaxNbDims{0};
+  int64_t sourceMaxNbDims{0};
   /// The maximum number of addressing dimensions on the target side of the DMA.
-  uint8_t targetMaxNbDims{0};
+  int64_t targetMaxNbDims{0};
 
   DmaDimConfig(const AMDAIE::AMDAIEDeviceModel &deviceModel,
                uint8_t sourceMemspaceInt, uint8_t targetMemspaceInt)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
index 6b2fda49e..2ebbabda6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -8,6 +8,7 @@
 
 #include <numeric>
 
+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -17,10 +18,66 @@
 #include "mlir/IR/Iterators.h"
 #include "mlir/IR/Operation.h"
 
+///////////////////////////////////////////////////
+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
+#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
+///////////////////////////////////////////////////
+
 #define DEBUG_TYPE "iree-amdaie-logicalobjfifo-splitting-utils"
 
 namespace mlir::iree_compiler::AMDAIE {
 
+/// Utility to create a new logical objectfifo based on the shape defined by
+/// `newSizesOpFoldResultArr`.
+static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo(
+    IRRewriter &rewriter,
+    AMDAIE::LogicalObjectFifoFromMemrefOp &oldLogicalObjectFifo,
+    SmallVector<OpFoldResult> &newSizesOpFoldResultArr) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<int64_t> newSizes;
+  for (OpFoldResult sizeVal : newSizesOpFoldResultArr) {
+    newSizes.push_back(getConstantIndexOrAssert(sizeVal));
+  }
+  Value oldAllocOp = oldLogicalObjectFifo.getMemref();
+  auto oldMemRefType = cast<MemRefType>(oldAllocOp.getType());
+  MemRefType newAllocType = MemRefType::get(
+      newSizes, oldMemRefType.getElementType(), MemRefLayoutAttrInterface{},
+      oldMemRefType.getMemorySpace());
+  assert(oldAllocOp.getDefiningOp() && "expected a defining op for the value");
+  rewriter.setInsertionPoint(oldAllocOp.getDefiningOp());
+  auto newAllocOp =
+      rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(), newAllocType);
+  auto newDeallocOp =
+      rewriter.create<memref::DeallocOp>(rewriter.getUnknownLoc(), newAllocOp);
+  newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
+  auto type = cast<MemRefType>(newAllocOp.getType());
+  // Create the new logical objectfifo.
+  rewriter.setInsertionPoint(oldLogicalObjectFifo);
+  auto newLogicalObjectFifo =
+      rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+          rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
+          newAllocOp.getResult(), oldLogicalObjectFifo.getTiles());
+  return newLogicalObjectFifo;
+}
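Note: a small usage sketch of the helper above, shrinking/replacing an L2 objectfifo with the combined 1x2x32x32 shape the pass produces in the lit test (the oldFifo variable is hypothetical; sizes are built with MLIR's getAsIndexOpFoldResult):

    SmallVector<OpFoldResult> newSizes =
        getAsIndexOpFoldResult(ctx, ArrayRef<int64_t>{1, 2, 32, 32});
    AMDAIE::LogicalObjectFifoFromMemrefOp newFifo =
        createNewLogicalObjectFifo(rewriter, oldFifo, newSizes);
    // newFifo wraps a fresh memref.alloc of the new shape on the same tiles
    // as oldFifo; a matching memref.dealloc is created as well.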
+
+/// Utility to help fetch those input DmaCpyNd ops which need to be split or
+/// combined.
+SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplitOrCombine(
+    ModuleOp moduleOp) {
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
+  // We are currently walking through CoreOps, gathering the 3rd input DmaOp
+  // (if applicable) from them.
+  // TODO(avarma): We will generalize this later.
+  moduleOp.walk([&](AMDAIE::CoreOp coreOp) {
+    SmallVector<Value> inputDmas = coreOp.getInputDmas();
+    if (inputDmas.size() != 3) return WalkResult::skip();
+    auto dmaCpyNdOp = inputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
+    assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op");
+    l2ToL1DmaOps.push_back(dmaCpyNdOp);
+    return WalkResult::advance();
+  });
+  return l2ToL1DmaOps;
+}
+
 /// Utility to verify that the split dimensions for L2 are contiguous.
 static LogicalResult checkIsRangeFromZero(
     SmallVector<size_t> &splitDimsSetForL2) {
@@ -124,6 +181,44 @@ static FailureOr<OpFoldResult> updateL3SourceOffset(IRRewriter &rewriter,
   return newL3AsSourceOffset;
 }
 
+/// Given an L2->L1 DmaCpyNd op, find the unique L3->L2 DmaCpyNd op.
+static FailureOr<AMDAIE::DmaCpyNdOp> fetchL3ToL2DmaCpyNdOp(
+    AMDAIE::DmaCpyNdOp l2ToL1DmaOp) {
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOp.getSourceObjectFifo();
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
+  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
+    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
+        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
+      l3ToL2DmaOps.push_back(dmaOp);
+    }
+  }
+  if (l3ToL2DmaOps.size() == 0) {
+    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  if (l3ToL2DmaOps.size() > 1) {
+    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma op for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  l3ToL2DmaOp = l3ToL2DmaOps[0];
+  if ((l3ToL2DmaOp.getTargetMixedOffsets().size() !=
+       l3ToL2DmaOp.getSourceMixedOffsets().size()) ||
+      (l3ToL2DmaOp.getTargetMixedSizes().size() !=
+       l3ToL2DmaOp.getSourceMixedSizes().size()) ||
+      (l3ToL2DmaOp.getTargetMixedStrides().size() !=
+       l3ToL2DmaOp.getSourceMixedStrides().size())) {
+    LLVM_DEBUG(llvm::dbgs() << "dimensionality of source and target "
+                               "offsets/sizes/strides found different for "
+                            << l3ToL2DmaOp << "\n");
+    return failure();
+  }
+  return l3ToL2DmaOp;
+}
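Note: the uniqueness requirement above amounts to there being exactly one DMA producer writing the shared L2 objectfifo; with more than one producer, the combining below would be ambiguous. Counting producers explicitly would look like this (a sketch, not part of the patch):

    auto isProducer = [&](Operation *user) {
      auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(user);
      return dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo;
    };
    unsigned nbProducers =
        llvm::count_if(sourceObjectFifo->getUsers(), isProducer);
    // fetchL3ToL2DmaCpyNdOp succeeds only when nbProducers == 1.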
+
 /// A struct utility to encapsulate all the data required to perform splitting
 /// of logicalobjectfifos.
 struct SplittingLogicalObjectFifoData {
@@ -186,36 +281,10 @@ static LogicalResult checkWhetherSplitIsPossible(
   }
 
   // Fetch the L3 -> L2 Dma Op corresponding to the L2 buffer as target.
-  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
-  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
-  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
-    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
-        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
-      l3ToL2DmaOps.push_back(dmaOp);
-    }
-  }
-  if (l3ToL2DmaOps.size() == 0) {
-    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
-                            << sourceObjectFifo << "\n");
-    return failure();
-  }
-  if (l3ToL2DmaOps.size() > 1) {
-    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma op for "
-                            << sourceObjectFifo << "\n");
-    return failure();
-  }
-  l3ToL2DmaOp = l3ToL2DmaOps[0];
-  if ((l3ToL2DmaOp.getTargetMixedOffsets().size() !=
-       l3ToL2DmaOp.getSourceMixedOffsets().size()) ||
-      (l3ToL2DmaOp.getTargetMixedSizes().size() !=
-       l3ToL2DmaOp.getSourceMixedSizes().size()) ||
-      (l3ToL2DmaOp.getTargetMixedStrides().size() !=
-       l3ToL2DmaOp.getSourceMixedStrides().size())) {
-    LLVM_DEBUG(llvm::dbgs() << "dimensionality of source and target "
-                               "offsets/sizes/strides found different for "
-                            << l3ToL2DmaOp << "\n");
-    return failure();
-  }
+  FailureOr<AMDAIE::DmaCpyNdOp> maybeL3ToL2DmaOp =
+      fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]);
+  if (failed(maybeL3ToL2DmaOp)) return failure();
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp = maybeL3ToL2DmaOp.value();
 
   SmallVector<OpFoldResult> staticL2AsTargetSizes =
       l3ToL2DmaOp.getTargetMixedSizes();
@@ -289,16 +358,13 @@ LogicalResult splitLogicalObjectFifos(
   toBeErased.insert(sourceAllocOp);
   toBeErased.insert(sourceObjectFifo);
 
-  SmallVector<OpFoldResult> staticL2AsTargetOffsets =
+  SmallVector<OpFoldResult> staticL2AsTargetOffsets =
       l3ToL2DmaOp.getTargetMixedOffsets();
-  SmallVector<OpFoldResult> staticL2AsTargetSizes =
+  SmallVector<OpFoldResult> staticL2AsTargetSizes =
      l3ToL2DmaOp.getTargetMixedSizes();
-  SmallVector<int64_t> l2ShapeAsTarget = llvm::to_vector(
-      cast<MemRefType>(l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType())
-          .getShape());
-  SmallVector<OpFoldResult> staticL3AsSourceOffsets =
+  SmallVector<OpFoldResult> staticL3AsSourceOffsets =
      l3ToL2DmaOp.getSourceMixedOffsets();
-  SmallVector<OpFoldResult> staticL3AsSourceSizes =
+  SmallVector<OpFoldResult> staticL3AsSourceSizes =
      l3ToL2DmaOp.getSourceMixedSizes();
   OpFoldResult zeroVal = getAsIndexOpFoldResult(context, 0);
   OpFoldResult oneVal = getAsIndexOpFoldResult(context, 1);
@@ -310,45 +376,28 @@ LogicalResult splitLogicalObjectFifos(
     staticL2AsTargetSizes[dim] = oneVal;
     staticL3AsSourceOffsets[dim] = zeroVal;
     staticL3AsSourceSizes[dim] = oneVal;
-    l2ShapeAsTarget[dim] = 1;
   }
 
   // Traverse each L2->L1 DmaCpyNd op and split them.
   for (AMDAIE::DmaCpyNdOp l2ToL1DmaOp : l2ToL1DmaOps) {
-    SmallVector<OpFoldResult> staticL2AsSourceOffsets =
+    SmallVector<OpFoldResult> staticL2AsSourceOffsets =
         l2ToL1DmaOp.getSourceMixedOffsets();
-    SmallVector<OpFoldResult> staticL2AsSourceSizes =
+    SmallVector<OpFoldResult> staticL2AsSourceSizes =
        l2ToL1DmaOp.getSourceMixedSizes();
 
     // Now we'll create a new L2 buffer based on the new shape inferred earlier
-    // via `l2ShapeAsTarget`.
-    rewriter.setInsertionPoint(sourceAllocOp);
-    LogicalObjectFifoFromMemrefOp targetObjectFifo =
-        l2ToL1DmaOp.getTargetObjectFifo();
-    Value targetAllocOp = targetObjectFifo.getMemref();
-    auto oldSourceMemRefType = cast<MemRefType>(sourceAllocOp.getType());
-    auto targetMemRefType = cast<MemRefType>(targetAllocOp.getType());
-    MemRefType newAllocType = MemRefType::get(
-        l2ShapeAsTarget, targetMemRefType.getElementType(),
-        MemRefLayoutAttrInterface{}, oldSourceMemRefType.getMemorySpace());
-    auto newAllocOp = rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(),
-                                                       newAllocType);
-    auto newDeallocOp = rewriter.create<memref::DeallocOp>(
-        rewriter.getUnknownLoc(), newAllocOp);
-    newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
-    auto type = cast<MemRefType>(newAllocOp.getType());
-    // Create new logicalobjectfifo.from_memref for the newly created L2 buffer.
-    rewriter.setInsertionPoint(l2ToL1DmaOp.getSourceObjectFifo());
-    auto source = rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-        rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
-        newAllocOp.getResult(), sourceObjectFifo.getTiles());
+    // via `staticL2AsTargetSizes`.
+    LogicalObjectFifoFromMemrefOp oldL2ObjectFifo =
+        l2ToL1DmaOp.getSourceObjectFifo();
+    AMDAIE::LogicalObjectFifoFromMemrefOp source = createNewLogicalObjectFifo(
+        rewriter, oldL2ObjectFifo, staticL2AsTargetSizes);
 
     // --------------------------------------------
     // ---------- L3 -> L2 splitting --------------
     // --------------------------------------------
    // Update L3 source offsets for non-split dimensions. Refer to the doc
    // comment of `updateL3SourceOffset` for the computation rationale involved.
-    SmallVector<OpFoldResult> staticL3AsSourceOffsets =
+    SmallVector<OpFoldResult> staticL3AsSourceOffsets =
         l3ToL2DmaOp.getSourceMixedOffsets();
     for (auto &&[splitDim, nonSplitdim] :
          llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) {
@@ -427,4 +476,386 @@ LogicalResult splitLogicalObjectFifos(
   return success();
 }
 
+static int64_t fetchOffsetBias(OpFoldResult offsetOpFoldResult) {
+  std::optional<int64_t> offset = getConstantIntValue(offsetOpFoldResult);
+  if (offset) return offset.value();
+  auto offsetVal = cast<Value>(offsetOpFoldResult);
+  auto affineApplyOp =
+      dyn_cast_if_present<affine::AffineApplyOp>(offsetVal.getDefiningOp());
+  if (!affineApplyOp) return 0;
+  AffineMap affineMap = affineApplyOp.getAffineMap();
+  RetrieveScaleAndBias retriever;
+  assert(!failed(retriever.visit(affineMap.getResult(0))) &&
+         "failed to retrieve scale and bias");
+  int64_t bias = 0;
+  if (retriever.bias) {
+    bias = retriever.bias.value();
+  }
+  return bias;
+}
+
+static LogicalResult combineL3ToL2AccessPatterns(
+    RewriterBase &rewriter, const SmallVector<OpFoldResult> &offsetsA,
+    const SmallVector<OpFoldResult> &sizesA,
+    const SmallVector<OpFoldResult> &stridesA,
+    const SmallVector<OpFoldResult> &offsetsB,
+    const SmallVector<OpFoldResult> &sizesB,
+    const SmallVector<OpFoldResult> &stridesB,
+    SmallVector<OpFoldResult> &newOffsets, SmallVector<OpFoldResult> &newSizes,
+    SmallVector<OpFoldResult> &newStrides, SmallVector<size_t> &splitDims,
+    SmallVector<size_t> &nonSplitDims) {
+  if (offsetsA.empty() && offsetsB.empty()) return success();
+
+  int64_t newSize = 1;
+  for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) {
+    if (iter.index() < splitDims.size()) continue;
+    const OpFoldResult &offsetA = std::get<0>(iter.value());
+    const OpFoldResult &offsetB = std::get<1>(iter.value());
+    if (offsetA != offsetB) {
+      // Need to check the difference in bias here.
+      int64_t biasA = fetchOffsetBias(offsetA);
+      int64_t biasB = fetchOffsetBias(offsetB);
+      std::optional<int64_t> sizeA = getConstantIntValue(sizesA[iter.index()]);
+      assert(sizeA && "expected a constant integer value for size");
+      assert((sizeA == biasB - biasA) &&
+             "L3->L2 pair cannot be combined because offset is not contiguous");
+      newSize++;
+    }
+  }
+  newSizes[splitDims.size() - 1] = rewriter.getI64IntegerAttr(newSize);
+  return success();
+}
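Note: concretely, in the lit test below the two L3 offsets on the combined dimension come from affine_map<(d0) -> (d0 * 64)> and affine_map<(d0) -> (d0 * 64 + 32)>, so fetchOffsetBias returns 0 and 32, and the constant size on that dimension is 32. The same arithmetic as a standalone sketch:

    int64_t biasA = 0, biasB = 32;  // from the two affine.apply offsets
    int64_t sizeA = 32;             // constant size on that dimension
    bool contiguous = (sizeA == biasB - biasA);  // B starts where A ends
    int64_t newSize = contiguous ? 2 : 1;  // one combined window of size 2

The combined access keeps A's offsets and strides and only bumps the size on the last split dimension, which is how [1, 1, 32, 32] becomes [1, 2, 32, 32] in the test.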
+
+static FailureOr<LogicalObjectFifoFromMemrefOp> combineL3ToL2Pair(
+    IRRewriter &rewriter, DmaCpyNdOp dmaOpA, DmaCpyNdOp dmaOpB,
+    SmallVector<size_t> &splitDims, SmallVector<size_t> &nonSplitDims) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<OpFoldResult> sourceOffsetsA = dmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = dmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA = dmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB = dmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = dmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB = dmaOpB.getSourceMixedStrides();
+
+  SmallVector<OpFoldResult> targetOffsetsA = dmaOpA.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesA = dmaOpA.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesA = dmaOpA.getTargetMixedStrides();
+  SmallVector<OpFoldResult> targetOffsetsB = dmaOpB.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesB = dmaOpB.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesB = dmaOpB.getTargetMixedStrides();
+
+  SmallVector<OpFoldResult> newSourceOffsets = sourceOffsetsA;
+  SmallVector<OpFoldResult> newSourceSizes = sourceSizesA;
+  SmallVector<OpFoldResult> newSourceStrides = sourceStridesA;
+  if (failed(combineL3ToL2AccessPatterns(
+          rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA,
+          sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets,
+          newSourceSizes, newSourceStrides, splitDims, nonSplitDims))) {
+    return failure();
+  }
+
+  SmallVector<OpFoldResult> newTargetOffsets = targetOffsetsA;
+  SmallVector<OpFoldResult> newTargetSizes = newSourceSizes;
+  SmallVector<OpFoldResult> newTargetStrides = targetStridesA;
+  // Now we need to create a new L2 buffer based on `newTargetSizes`.
+  LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = dmaOpA.getTargetObjectFifo();
+  AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+      createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newTargetSizes);
+
+  // Create the combined L3->L2 Dma.
+  rewriter.setInsertionPoint(dmaOpA);
+  auto combinedL3ToL2DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+      dmaOpA.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets),
+      llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides),
+      dmaOpA.getSource(), llvm::ArrayRef(newSourceOffsets),
+      llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides));
+  // Replace the uses of the 2nd L3->L2 Dma with the new combined L3->L2 Dma
+  // and erase the 1st L3->L2 Dma.
+  rewriter.replaceOp(dmaOpB, combinedL3ToL2DmaOp);
+  rewriter.eraseOp(dmaOpA);
+  return newL2ObjectFifo;
+}
+
+/// Utility to fetch the unique CoreOp associated with an L2->L1 Dma op.
+static CoreOp fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp) {
+  SmallVector<CoreOp> coreOps;
+  for (Operation *userOp : l2ToL1DmaOp->getUsers()) {
+    if (auto coreOp = dyn_cast<CoreOp>(userOp)) {
+      coreOps.push_back(coreOp);
+    }
+  }
+  assert(coreOps.size() == 1 &&
+         "L2->L1 Dma op expected to have a unique Core op");
+  return coreOps[0];
+}
+
+static bool compareL3ToL2DmaPair(DmaCpyNdOp &a, DmaCpyNdOp &b) {
+  SmallVector<OpFoldResult> sourceOffsetsA = a.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = a.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceOffsetsB = b.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = b.getSourceMixedSizes();
+  // We'll add assertion checks on the size before invoking this function.
+  for (int64_t i = 0, n = sourceOffsetsA.size(); i < n; i++) {
+    std::optional<int64_t> offsetA = getConstantIntValue(sourceOffsetsA[i]);
+    std::optional<int64_t> offsetB = getConstantIntValue(sourceOffsetsB[i]);
+    if (offsetA && offsetB) {
+      if (offsetA < offsetB) return true;
+      if (offsetA > offsetB) return false;
+      continue;
+    }
+    if (!offsetA && !offsetB) {
+      auto offsetValA = cast<Value>(sourceOffsetsA[i]);
+      auto offsetValB = cast<Value>(sourceOffsetsB[i]);
+      auto affineApplyOpA = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValA.getDefiningOp());
+      auto affineApplyOpB = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValB.getDefiningOp());
+      // TODO(avarma): This should be handled better. The overall possibility
+      // here already makes this complex enough.
+      assert(affineApplyOpA && "expected affine.apply op");
+      assert(affineApplyOpB && "expected affine.apply op");
+      for (auto &&[valA, valB] :
+           llvm::zip_equal(affineApplyOpA.getMapOperands(),
+                           affineApplyOpB.getMapOperands())) {
+        assert((valA == valB) &&
+               "different base values being operated on between the L3->L2 "
+               "Dma op pair");
+      }
+      AffineMap affineMapA = affineApplyOpA.getAffineMap();
+      AffineMap affineMapB = affineApplyOpB.getAffineMap();
+      RetrieveScaleAndBias retrieverA, retrieverB;
+      assert(!failed(retrieverA.visit(affineMapA.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      assert(!failed(retrieverB.visit(affineMapB.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      int64_t biasA = 0, biasB = 0;
+      if (retrieverA.bias) {
+        biasA = retrieverA.bias.value();
+      }
+      if (retrieverB.bias) {
+        biasB = retrieverB.bias.value();
+      }
+      // TODO(avarma): We should also check the scale value as well.
+      if (biasA < biasB) return true;
+      if (biasA > biasB) return false;
+      continue;
+    }
+    assert(false &&
+           "unexpected combination of offset val amongst L3->L2 Dma pair");
+  }
+  return false;
+}
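Note: compareL3ToL2DmaPair is a less-than comparator over source offsets (constant offsets first, then the affine.apply biases). The insertion sort later in combineLogicalObjectFifos keeps l3ToL2DmaOps and l2ToL1DmaOps in lockstep; an equivalent formulation via an index permutation would be (a sketch, assuming <algorithm> and <numeric>):

    SmallVector<unsigned> order(l3ToL2DmaOps.size());
    std::iota(order.begin(), order.end(), 0);
    std::stable_sort(order.begin(), order.end(), [&](unsigned x, unsigned y) {
      return compareL3ToL2DmaPair(l3ToL2DmaOps[x], l3ToL2DmaOps[y]);
    });
    // Then apply `order` to both op vectors so adjacent entries pair up.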
+
+static LogicalResult checkIfSameDimensionalityAccessPatterns(
+    AMDAIE::DmaCpyNdOp &l3ToL2DmaOpA, AMDAIE::DmaCpyNdOp &l3ToL2DmaOpB) {
+  SmallVector<OpFoldResult> sourceOffsetsA =
+      l3ToL2DmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = l3ToL2DmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA =
+      l3ToL2DmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB =
+      l3ToL2DmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = l3ToL2DmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB =
+      l3ToL2DmaOpB.getSourceMixedStrides();
+  if (sourceOffsetsA.size() != sourceOffsetsB.size() ||
+      sourceSizesA.size() != sourceSizesB.size() ||
+      sourceStridesA.size() != sourceStridesB.size() ||
+      sourceOffsetsA.size() != sourceSizesA.size() ||
+      sourceOffsetsA.size() != sourceStridesB.size()) {
+    return failure();
+  }
+  return success();
+}
+
+/// Given a vector of L2->L1 Dma ops, combine the corresponding L3->L2 Dma ops
+/// and reuse the L2/L1 buffers.
+/// TODO(avarma): Assign combined tiles while forming the L2/L1 buffers which
+/// we'll reuse.
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context) {
+  if (l2ToL1DmaOps.size() == 0) return success();
+
+  // Fetch the L3 -> L2 Dma op corresponding to the first L2 buffer as target.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  FailureOr<AMDAIE::DmaCpyNdOp> maybeL3ToL2DmaOp =
+      fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]);
+  if (failed(maybeL3ToL2DmaOp)) return failure();
+  l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+
+  // Check that the L3 buffers associated with the different L3->L2 Dma ops
+  // are all the same.
+  for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) {
+    maybeL3ToL2DmaOp = fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[i]);
+    if (failed(maybeL3ToL2DmaOp)) return failure();
+    l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+    if (l3ToL2DmaOps[0].getSourceObjectFifo() !=
+        l3ToL2DmaOps[i].getSourceObjectFifo()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Found different L3 objectFifo for " << l3ToL2DmaOps[0]
+                 << " and " << l3ToL2DmaOps[i] << "\n");
+      return failure();
+    }
+    if (failed(checkIfSameDimensionalityAccessPatterns(l3ToL2DmaOps[0],
+                                                       l3ToL2DmaOps[i]))) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Found different dimensionality of access patterns\n");
+      return failure();
+    }
+  }
+
+  if (l2ToL1DmaOps.size() != l3ToL2DmaOps.size()) {
+    LLVM_DEBUG(
+        llvm::dbgs()
+        << "expected 1:1 correspondence between L3->L2 and L2->L1 Dma ops\n");
+    return failure();
+  }
+
+  // Fetch split/non-split dimensions. Currently we look for a continuous
+  // sequence of 0 offset dims with size as 1 to infer them as split
+  // dimensions.
+  DenseSet<size_t> splitDimsSetForL2;
+  SmallVector<size_t> splitDimsForL2;
+  size_t maxSplitDimIndex = 0;
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i++) {
+    SmallVector<OpFoldResult> sourceOffsets =
+        l3ToL2DmaOps[i].getSourceMixedOffsets();
+    SmallVector<OpFoldResult> sourceSizes =
+        l3ToL2DmaOps[i].getSourceMixedSizes();
+    unsigned j = 0, m = sourceOffsets.size();
+    // Traverse through the i-th L3->L2 Dma op's source offset/size to find a
+    // continuous sequence of 0 offset dims with size as 1.
+    while (j < m) {
+      std::optional<int64_t> constantOffset =
+          getConstantIntValue(sourceOffsets[j]);
+      if (!constantOffset || constantOffset.value() != 0) {
+        break;
+      }
+      std::optional<int64_t> constantSize = getConstantIntValue(sourceSizes[j]);
+      if (!constantSize || constantSize.value() != 1) {
+        break;
+      }
+      j++;
+    }
+    if (i == 0) {
+      maxSplitDimIndex = j;
+    } else if (maxSplitDimIndex != j) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "incompatible split dimensions across L3->L2\n");
+      return failure();
+    }
+  }
+  SmallVector<size_t> splitDims(maxSplitDimIndex);
+  std::iota(splitDims.begin(), splitDims.end(), 0);
+  SmallVector<size_t> nonSplitDims(maxSplitDimIndex);
+  std::iota(nonSplitDims.begin(), nonSplitDims.end(), splitDims.size());
+
+  // Sort the L3->L2 Dma ops based on the "overlapping" offsets, and sort the
+  // corresponding L2->L1 Dma ops accordingly.
+  for (int64_t i = 1, n = l3ToL2DmaOps.size(); i < n; i++) {
+    DmaCpyNdOp currL3ToL2DmaOp = l3ToL2DmaOps[i];
+    DmaCpyNdOp currL2ToL1DmaOp = l2ToL1DmaOps[i];
+    int64_t j = i - 1;
+    while (j >= 0 && compareL3ToL2DmaPair(currL3ToL2DmaOp, l3ToL2DmaOps[j])) {
+      l3ToL2DmaOps[j + 1] = l3ToL2DmaOps[j];
+      l2ToL1DmaOps[j + 1] = l2ToL1DmaOps[j];
+      j--;
+    }
+    l3ToL2DmaOps[j + 1] = currL3ToL2DmaOp;
+    l2ToL1DmaOps[j + 1] = currL2ToL1DmaOp;
+  }
+
+  // Currently we have 4 cores, so there are two pairs of DmaCpyNds to combine.
+  // TODO(avarma): Revisit this later when we want to target more cores.
+  if (l3ToL2DmaOps.size() % 2 != 0) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "found an odd number of L3->L2 Dma ops for combining\n");
+    return failure();
+  }
+
+  auto createL2ToL1ForReuse =
+      [](IRRewriter &rewriter, DmaCpyNdOp &l2ToL1DmaOp,
+         LogicalObjectFifoFromMemrefOp &reuseL1Buffer,
+         LogicalObjectFifoFromMemrefOp &reuseL2Buffer,
+         SmallVector<OpFoldResult> &newL2SourceOffsets) -> DmaCpyNdOp {
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), reuseL1Buffer,
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), reuseL2Buffer,
+        llvm::ArrayRef(newL2SourceOffsets), l2ToL1DmaOp.getSourceMixedSizes(),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+    return newL2ToL1DmaOp;
+  };
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
+    // Step 1. Combine the picked L3->L2 DmaCpyNd pair.
+    FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
+        combineL3ToL2Pair(rewriter, l3ToL2DmaOps[i], l3ToL2DmaOps[i + 1],
+                          splitDims, nonSplitDims);
+    if (failed(maybeNewL2ObjectFifo)) return failure();
+    LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+        maybeNewL2ObjectFifo.value();
+
+    // Step 2. We now need to create two L2->L1 ops since the size has
+    // changed. But for this we first need to find the new offset for L2 as
+    // source.
+    // TODO: For now I'm hardcoding the offsets, but later it'd just depend
+    // on split/non-split dimensions.
+    // Offset = 0, 0.
+    LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    SmallVector<OpFoldResult> newL2AsSourceOffsets =
+        l2ToL1DmaOps[i].getSourceMixedOffsets();
+    DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse(
+        rewriter, l2ToL1DmaOps[i], reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
+        newL2AsSourceOffsets);
+    // Offset = 0, 1. NOTE: here we use the same L1 logical objectFifo as
+    // the first L2->L1 Dma.
+    newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
+    newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
+    DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse(
+        rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp,
+        newL2ObjectFifo, newL2AsSourceOffsets);
+
+    // Step 3. Pick the CoreOps associated with the 1:1 L2->L1 Dma ops.
+    // For the first Core op we'll insert the Read at the end. It doesn't
+    // matter for now, so we're going to insert it right before the amdaie.end
+    // op.
+    CoreOp firstCoreOp = fetchUniqueCoreOp(newFirstL2ToL1DmaOp);
+    firstCoreOp.walk([&](AMDAIE::EndOp endOp) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      // Hardcoded to `AMDAIE::MemoryAccess::Read`.
+      rewriter.setInsertionPoint(endOp);
+      rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+          rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+          AMDAIE::MemoryAccess::Read);
+    });
+    // For the second Core op we'll insert the `Read` right before the first
+    // read from the corresponding L1 logicalobjectFifo.
+    CoreOp secondCoreOp = fetchUniqueCoreOp(newSecondL2ToL1DmaOp);
+    secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        // Hardcoded to `AMDAIE::MemoryAccess::Read`.
+        rewriter.setInsertionPoint(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            AMDAIE::MemoryAccess::Read);
+        // We need to insert the second access op because THIS is what will
+        // actually be used.
+        auto secondAccessOp =
+            rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+                rewriter.getUnknownLoc(),
+                reuseL1LogicalObjectFifoOp.getOutput(),
+                AMDAIE::MemoryAccess::Read);
+        rewriter.replaceOp(accessOp, secondAccessOp);
+      }
+    });
+  }
+
+  return success();
+}
+
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
index 919004949..82b342c48 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
@@ -11,13 +11,20 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
-/// Utility to split logicalobjectfifos given a struct
-/// `SplittingLogicalObjectFifoData` which contains all the required data to
-/// perform the splitting.
+/// Utility to help fetch those input DmaCpyNd ops which need to be split or
+/// combined.
+SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplitOrCombine(
+    ModuleOp moduleOp);
+
+/// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops.
 LogicalResult splitLogicalObjectFifos(
     IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
     MLIRContext *context);
 
+/// Utility to combine logicalobjectfifos given a vector of L2->L1 dma ops.
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context);
+
 }  // namespace mlir::iree_compiler::AMDAIE
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
index e6736a7c9..4839246a4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
@@ -16,24 +16,6 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-/// Utility to help fetch those input DmaCpyNd Ops which needs to be split.
-static SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplit(
-    ModuleOp moduleOp) {
-  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
-  // We are currently walking through CoreOps gathering 3rd Input DmaOp (if
-  // applicable) from them.
-  // TODO(avarma): We will generalize this later.
-  moduleOp.walk([&](AMDAIE::CoreOp coreOp) {
-    SmallVector<Value> inputDmas = coreOp.getInputDmas();
-    if (inputDmas.size() != 3) return WalkResult::skip();
-    auto dmaCpyNdOp = inputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
-    assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op");
-    l2ToL1DmaOps.push_back(dmaCpyNdOp);
-    return WalkResult::advance();
-  });
-  return l2ToL1DmaOps;
-}
-
 class AMDAIESplitLogicalObjFifosForConnectionReusePass
     : public impl::AMDAIESplitLogicalObjFifosForConnectionReuseBase<
           AMDAIESplitLogicalObjFifosForConnectionReusePass> {
@@ -53,7 +35,7 @@ void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() {
   IRRewriter rewriter(context);
 
   SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
-      fetchDmaCpyNdOpsToSplit(moduleOp);
+      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
 
   if (failed(splitLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
     LLVM_DEBUG(llvm::dbgs()
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 2979c71ef..ca7fc9bd5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -53,6 +53,7 @@ iree_cc_library(
     "AMDAIECanonicalizeDma.cpp"
     "AMDAIECanonicalizeNpuDmaCpyNd.cpp"
     "AMDAIECanonicalizeDoublyStridedOp.cpp"
+    "AMDAIECombineLogicalObjFifosForConnectionReuse.cpp"
     "AMDAIECombineStridedOps.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 8912db52d..ec54736db 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -70,6 +70,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEPEELFORLOOP
 #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
 #define GEN_PASS_DEF_AMDAIESINKINTOCORE
+#define GEN_PASS_DEF_AMDAIECOMBINELOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index c1aa45c0b..fa2f73482 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -595,6 +595,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass());
+  passManager.addPass(createCSEPass());
+  passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIEDmaToCircularDmaPass());
   passManager.addNestedPass<func::FuncOp>(createAMDAIECreateAIEWorkgroupPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index fe5670067..b0689bffb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -230,6 +230,9 @@ std::unique_ptr<Pass> createAMDAIEPeelForLoopPass(
 
 /// Create a pass to sink all dependencies into `amdaie.core` operations.
 std::unique_ptr<Pass> createAMDAIESinkIntoCorePass();
 
+/// Create a pass to combine logicalobjectfifos for connection reuse.
+std::unique_ptr<Pass> createAMDAIECombineLogicalObjFifosForConnectionReusePass();
+
 /// Create a pass to split logicalobjectfifos for connection reuse.
 std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 73ceee040..8ea61e340 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -487,6 +487,11 @@ def AMDAIESplitLogicalObjFifosForConnectionReuse :
   let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations.";
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosForConnectionReusePass()";
 }
+def AMDAIECombineLogicalObjFifosForConnectionReuse :
+    Pass<"iree-amdaie-combine-logical-objectfifos-for-connection-reuse", "ModuleOp"> {
+  let summary = "Pass to combine L2 buffers to share inputs of Matmul and Elementwise operations.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECombineLogicalObjFifosForConnectionReusePass()";
+}
 
 def AMDAIETile :
     InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index ba4380860..261a8068c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
     "canonicalize_dma.mlir"
     "canonicalize_doubly_strided_op.mlir"
     "canonicalize_npu_dma_cpy_nd.mlir"
+    "combine_logicalobjfifos_for_connection_reuse.mlir"
     "combine_strided_ops.mlir"
    "controlcode_loop_unrolling.mlir"
    "convert_core_forall_to_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
new file mode 100644
index 000000000..aee2023e3
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
@@ -0,0 +1,219 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-combine-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)>
+// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK: @combine_logical_objFifos
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK: memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+// CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]])
+// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]])
+// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]])
+// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+// CHECK: linalg.generic
+// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[FIRST_READ]]
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: amdaie.end
+// CHECK: }
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+// CHECK: linalg.generic
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[SECOND_READ]]
+// CHECK: amdaie.end
+// CHECK: }
+// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
+// CHECK: linalg.generic
+// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[FIRST_READ]]
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: amdaie.end
+// CHECK: }
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
+// CHECK: linalg.generic
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[SECOND_READ]]
+// CHECK: amdaie.end
+// CHECK: }
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @combine_logical_objFifos(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_4 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_5 = memref.alloc() : memref<128x128xi32>
+    %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_7 = amdaie.tile(%c0, %c2)
+    %tile_8 = amdaie.tile(%c1, %c2)
+    %tile_9 = amdaie.tile(%c0, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %5 = affine.apply #map(%arg5)
+      %6 = affine.apply #map1(%arg5)
+      %7 = affine.apply #map(%arg4)
+      %8 = affine.apply #map1(%arg4)
+      %9 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %10 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %11 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %12 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %15 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %13[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %16 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %14[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %17 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_7} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %20 = amdaie.core(%tile_7, in : [%15, %16, %19], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %23 = amdaie.core(%tile, in : [%15, %16, %22], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_8} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %25 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %26 = amdaie.core(%tile_8, in : [%15, %16, %25], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%24, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps =
+[#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %27 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %28 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %29 = amdaie.core(%tile_9, in : [%15, %16, %28], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%27, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_5 : memref<128x128xi32>
+    memref.dealloc %alloc_1 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_4 : memref<1x1x32x32xi32, 1 : i32>
+    return
+  }
+}
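Note: quick sanity arithmetic for the expected output above: the four original 1x1x32x32 L2 tiles are merged pairwise into two 1x2x32x32 buffers, and each L2->L1 copy then selects one half via the second source offset (0 or 1). As a standalone C++ sketch of the size bookkeeping:

    constexpr int64_t combinedElems = 1 * 2 * 32 * 32;      // 2048 i32 each
    static_assert(combinedElems == 2 * (1 * 1 * 32 * 32),
                  "one combined L2 buffer holds exactly two original tiles");

This halves the number of L3->L2 connections (two combined dma_cpy_nd ops instead of four) while the reused L1 objectfifo is shared by the two cores of each pair, which is the connection reuse the pass name refers to.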