From 45f4abc5a6345ba0e4edc0c24df08ef38f1c3436 Mon Sep 17 00:00:00 2001 From: "chen.qian" Date: Fri, 15 Nov 2024 16:15:20 +0800 Subject: [PATCH] [Pass] add LoopUnrollAndRemainder pass --- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + .../RISCV/RISCVLoopUnrollAndRemainder.cpp | 5053 +++++++++++++++++ .../RISCV/RISCVLoopUnrollAndRemainder.h | 42 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7 + .../RISCV/RISCVLoopUnrollAndRemainder/add.ll | 162 +- .../RISCV/RISCVLoopUnrollAndRemainder/addc.ll | 126 +- .../RISCVLoopUnrollAndRemainder/ccorr.ll | 454 +- .../RISCV/RISCVLoopUnrollAndRemainder/conv.ll | 450 +- .../RISCV/RISCVLoopUnrollAndRemainder/corr.ll | 242 +- .../RISCVLoopUnrollAndRemainder/dotprod.ll | 129 +- .../dotprod_template_complex.ll | 115 +- .../RISCVLoopUnrollAndRemainder/dotprode.ll | 131 +- .../RISCV/RISCVLoopUnrollAndRemainder/fir.ll | 299 +- .../RISCV/RISCVLoopUnrollAndRemainder/fird.ll | 327 +- .../loopsecvconstant.ll | 78 +- .../RISCV/RISCVLoopUnrollAndRemainder/mul.ll | 162 +- .../RISCV/RISCVLoopUnrollAndRemainder/mulc.ll | 124 +- .../RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll | 142 +- .../RISCV/RISCVLoopUnrollAndRemainder/sub.ll | 162 +- 19 files changed, 7831 insertions(+), 375 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 654c84b0695c27..05581a9a9af296 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(RISCVCodeGen RISCVGatherScatterLowering.cpp RISCVSplitLoopByLength.cpp RISCVCustomLICM.cpp + RISCVLoopUnrollAndRemainder.cpp RISCVInsertVSETVLI.cpp RISCVInsertReadWriteCSR.cpp RISCVInsertWriteVXRM.cpp diff --git a/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp new file mode 100644 index 00000000000000..587090ec2cf922 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp @@ -0,0 +1,5053 @@ +//===-- RISCVLoopUnrollAndRemainder.cpp - Loop Unrolling Pass +//------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a loop unrolling optimization pass specifically designed +// for Digital Signal Processing (DSP) algorithms. The pass targets common +// computational patterns found in various DSP operations including: +// - FIR and IIR filters +// - Convolution and correlation +// - Vector operations +// - Dot product calculations +// - Mathematical functions +// +// The pass performs the following main operations: +// 1. Identifies loops in DSP algorithm implementations +// 2. Unrolls the main computational loops, typically by a factor of 8 +// 3. Efficiently handles remainder iterations +// 4. Optimizes memory access patterns for improved cache utilization +// 5. Adjusts control flow and PHI nodes to support the unrolled structure +// 6. 
Performs cleanup and further optimization after unrolling +// +// This transformation can significantly improve performance for DSP algorithms +// by: +// - Increasing instruction-level parallelism +// - Improving cache utilization for data and coefficient access +// - Reducing loop overhead +// - Enabling better vectorization opportunities +// +// The pass is particularly effective for algorithms with intensive loop-based +// computations, where the main computational loop dominates the execution time. +// It aims to optimize both the main loop body and the handling of edge cases, +// providing a balance between performance and code size. +// +//===----------------------------------------------------------------------===// +#include "RISCVLoopUnrollAndRemainder.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/DCE.h" +#include "llvm/Transforms/Scalar/DeadStoreElimination.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopPeel.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/SimplifyCFGOptions.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "riscv-loop-unroll-and-remainder" + +// Enumeration to represent different types of unrolling +enum class UnrollType { + DOTPROD, + ADD_ADDC_SUB_MUL_MULC_SQRT, + 
CONV_CCORR,
+  FIRD,
+  FIR,
+  CORR,
+  UNKNOWN
+};
+
+// Global variable to store the current unroll type
+static UnrollType currentUnrollType = UnrollType::UNKNOWN;
+
+// Command line option to enable the RISCVLoopUnrollAndRemainder pass
+cl::opt<bool> llvm::EnableRISCVLoopUnrollAndRemainder(
+    "riscv-loop-unroll-and-remainder", cl::init(false),
+    cl::desc("Enable loop unrolling and remainder specific loop"));
+
+// Helper function to get a basic block by name from a function
+static BasicBlock *getBasicBlockByName(Function &F, StringRef Name) {
+  for (BasicBlock &BB : F)
+    if (BB.getName() == Name)
+      return &BB;
+  return nullptr;
+}
+
+// Helper function to get the first ICmp instruction with a specific predicate
+// in a basic block
+static ICmpInst *getFirstICmpInstWithPredicate(BasicBlock *BB,
+                                               ICmpInst::Predicate Predicate) {
+  for (Instruction &I : *BB) {
+    if (auto *CI = dyn_cast<ICmpInst>(&I)) {
+      if (CI->getPredicate() == Predicate) {
+        return CI;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last ICmp instruction with a specific predicate in
+// a basic block
+static ICmpInst *getLastICmpInstWithPredicate(BasicBlock *BB,
+                                              ICmpInst::Predicate Predicate) {
+  ICmpInst *lastICmp = nullptr;
+  for (Instruction &I : *BB) {
+    if (auto *CI = dyn_cast<ICmpInst>(&I)) {
+      if (CI->getPredicate() == Predicate) {
+        lastICmp = CI;
+      }
+    }
+  }
+  return lastICmp;
+}
+
+// Helper function to get the first ICmp instruction in a basic block
+static ICmpInst *getFirstICmpInst(BasicBlock *BB) {
+  for (Instruction &I : *BB) {
+    if (auto *CI = dyn_cast<ICmpInst>(&I)) {
+      return CI;
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last ICmp instruction in a basic block
+static ICmpInst *getLastICmpInst(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *icmp = dyn_cast<ICmpInst>(&*it)) {
+      return icmp;
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the first float PHI node in a basic block
+static PHINode *getFirstFloatPhi(BasicBlock *BB) {
+  for (auto &Inst : *BB) {
+    if (auto *Phi = dyn_cast<PHINode>(&Inst)) {
+      if (Phi->getType()->isFloatTy()) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last float PHI node in a basic block
+static PHINode *getLastFloatPhi(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *Phi = dyn_cast<PHINode>(&*it)) {
+      if (Phi->getType()->isFloatTy()) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the first 32-bit integer PHI node in a basic block
+static PHINode *getFirstI32Phi(BasicBlock *BB) {
+  for (auto &Inst : *BB) {
+    if (auto *Phi = dyn_cast<PHINode>(&Inst)) {
+      if (Phi->getType()->isIntegerTy(32)) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last 32-bit integer PHI node in a basic block
+static PHINode *getLastI32Phi(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *Phi = dyn_cast<PHINode>(&*it)) {
+      if (Phi->getType()->isIntegerTy(32)) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last PHI node in a basic block
+static PHINode *getLastPhi(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *Phi = dyn_cast<PHINode>(&*it)) {
+      return Phi;
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the first CallInst with a specific name in a basic
+// block
+static CallInst *getFirstCallInstWithName(BasicBlock *BB, StringRef Name) {
+  for (Instruction &I : *BB) {
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (Call->getCalledFunction() &&
+          Call->getCalledFunction()->getName() == Name) {
+        return Call;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to update operands of new instructions
+static void updateOperands(SmallVector<Instruction *> &NewInsts,
+                           ValueToValueMapTy &ValueMap) {
+  for (Instruction *inst : NewInsts) {
+    for (unsigned i = 0; i < inst->getNumOperands(); i++) {
+      Value *op = inst->getOperand(i);
+      if (ValueMap.count(op)) {
+        inst->setOperand(i, ValueMap[op]);
+      }
+    }
+  }
+}
+
+// Helper function to swap the successors of a terminator instruction
+static void swapTerminatorSuccessors(BasicBlock *BB) {
+  if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    if (BI->isConditional() && BI->getNumSuccessors() == 2) {
+      BasicBlock *TrueSuccessor = BI->getSuccessor(0);
+      BasicBlock *FalseSuccessor = BI->getSuccessor(1);
+      BI->setSuccessor(0, FalseSuccessor);
+      BI->setSuccessor(1, TrueSuccessor);
+    } else {
+      llvm_unreachable("BB's terminator is not a conditional branch or doesn't "
+                       "have two successors");
+    }
+  } else {
+    llvm_unreachable("BB's terminator is not a branch instruction");
+  }
+}
+
+// Helper function to clone a basic block and update its relations
+static BasicBlock *cloneBasicBlockWithRelations(BasicBlock *BB,
+                                                const std::string &NameSuffix,
+                                                Function *F) {
+  ValueToValueMapTy VMap;
+  BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+
+  // Update instruction references in the new block
+  for (Instruction &I : *NewBB) {
+    // Update operands
+    for (Use &U : I.operands()) {
+      Value *V = U.get();
+      Value *NewV = VMap[V];
+      if (NewV) {
+        U.set(NewV);
+      }
+    }
+
+    // Update PHI node basic block references
+    if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        BasicBlock *IncomingBB = PN->getIncomingBlock(i);
+        if (IncomingBB == BB) {
+          PN->setIncomingBlock(i, NewBB);
+        } else if (VMap.count(IncomingBB)) {
+          PN->setIncomingBlock(i, cast<BasicBlock>(VMap[IncomingBB]));
+        }
+      }
+    }
+  }
+
+  return NewBB;
+}
+
+// Helper function to unroll and duplicate a loop iteration
+static Instruction *unrollAndDuplicateLoopIteration(LLVMContext &Ctx,
+                                                    BasicBlock *BB,
+                                                    IRBuilder<> &Builder,
+                                                    unsigned int i) {
+  PHINode *IPhi = dyn_cast<PHINode>(&BB->front());
+  BasicBlock::iterator BeginIt, EndIt, ToIt;
+  SmallVector<Instruction *> newInsts;
+  ValueToValueMapTy ValueMap;
+  Instruction *Add = nullptr;
+  Instruction *tailcallfmuladd = nullptr;
+  Instruction *duplicatedPhiNode = nullptr;
+
+  // Find the range of instructions to duplicate
+  for (Instruction &I : *BB) {
+    if (auto *phi = dyn_cast<PHINode>(&I)) {
+      if (phi->getType()->isFloatTy()) {
+        BeginIt = I.getIterator();
+      }
+    } else if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) {
+      EndIt = std::next(I.getIterator());
+      tailcallfmuladd = &I;
+      ToIt = std::next(EndIt);
+      break;
+    }
+  }
+
+  assert(&*BeginIt && &*EndIt && "Failed to find instruction range");
+
+  // Clone and modify instructions
+  int arrayidx = 0;
+  for (auto it = BeginIt; it != EndIt; ++it) {
+    Instruction *newInst = it->clone();
+    if (newInst->getOpcode() == Instruction::PHI)
+      newInst->setName("acc" + Twine(i));
+
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(newInst)) {
+      if (!Add)
+        Add = BinaryOperator::CreateDisjoint(
+            Instruction::Or, IPhi, ConstantInt::get(Type::getInt32Ty(Ctx), i),
+            "add" + Twine(i), BB);
+
+      newInst->setName("arrayidx" + Twine(i) + "_" + Twine(arrayidx));
+      newInst->setOperand(1, Add);
+      arrayidx++;
+    }
+    newInsts.push_back(newInst);
+    ValueMap[&*it] = newInst;
+  }
+
+  // Update operands and insert new instructions
+  updateOperands(newInsts, ValueMap);
+  for (Instruction 
*newInst : newInsts) { + if (newInst->getOpcode() == Instruction::PHI) + duplicatedPhiNode = newInst->clone(); + newInst->insertInto(BB, BB->end()); + } + + return duplicatedPhiNode; +} + +// Helper function to move PHI nodes to the top of a basic block +static void movePHINodesToTop(BasicBlock &BB, + BasicBlock *ForBodyPreheaderBB = nullptr) { + SmallVector PHIs; + for (Instruction &I : BB) { + if (PHINode *PHI = dyn_cast(&I)) { + if (ForBodyPreheaderBB) + PHI->setIncomingBlock(1, ForBodyPreheaderBB); + PHIs.push_back(PHI); + } + } + + // Move PHI nodes in reverse order + for (auto it = PHIs.rbegin(); it != PHIs.rend(); ++it) { + (*it)->moveBefore(&BB.front()); + } +} + +// Helper function to update predecessors to point to a new preheader +static void updatePredecessorsToPreheader(BasicBlock *ForBody, + BasicBlock *ForBodyPreheader) { + SmallVector predecessors_bb; + for (auto *Pred : predecessors(ForBody)) { + if (Pred != ForBody) + predecessors_bb.push_back(Pred); + } + + for (BasicBlock *Pred : predecessors_bb) { + Instruction *TI = Pred->getTerminator(); + for (unsigned i = 0; i < TI->getNumSuccessors(); ++i) { + if (TI->getSuccessor(i) == ForBody) { + TI->setSuccessor(i, ForBodyPreheader); + } + } + } + + if (!ForBodyPreheader->getTerminator()) { + BranchInst::Create(ForBody, ForBodyPreheader); + } +} + +// Helper function to get the 'len' value from the entry block +static Value *getLenFromEntryBlock(Function &F) { + ICmpInst *ICmp = nullptr; + for (BasicBlock &BB : F) { + ICmp = getFirstICmpInstWithPredicate(&BB, ICmpInst::ICMP_SGT); + if (ICmp) + break; + } + + assert(ICmp && "icmp sgt instruction not found"); + return ICmp->getOperand(0); +} + +// Helper function to find specific instructions in a basic block +static std::tuple +findKeyInstructions(BasicBlock *ForBody) { + PHINode *ThirdPHI = nullptr; + CallInst *callInst = nullptr; + BinaryOperator *addInst = nullptr; + int PHICount = 0; + + for (Instruction &I : *ForBody) { + if (auto *PHI = dyn_cast(&I)) { + PHICount++; + if (PHICount == 3) { + ThirdPHI = PHI; + } + } else if (auto *ci = dyn_cast(&I)) { + callInst = ci; + } else if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Add) { + addInst = BinOp; + } + } + } + + return std::make_tuple(ThirdPHI, callInst, addInst); +} + +// Helper function to rename instructions +static void renameInstruction(Instruction *inst) { + if (inst->getOpcode() == Instruction::PHI) { + inst->setName("acc"); + } else if (inst->getOpcode() == Instruction::GetElementPtr) { + inst->setName("arrayidx"); + } +} + +// Helper function to set add instruction in for body +static void setAddInForBody(Instruction *inst, Instruction *Add, + Instruction *InsertBefore) { + if (inst->getOpcode() == Instruction::PHI) { + Add->moveBefore(InsertBefore); + } else if (inst->getOpcode() == Instruction::GetElementPtr) { + inst->setOperand(1, Add); + } +} + +// Helper function to copy and remap instructions +static void copyAndRemapInstructions(Instruction *StartInst, + Instruction *EndInst, + Instruction *InsertBefore, + Instruction *Add) { + ValueToValueMapTy ValueMap; + SmallVector NewInsts; + + for (auto it = StartInst->getIterator(); &*it != EndInst; ++it) { + Instruction *newInst = it->clone(); + if (auto *BinOp = dyn_cast(newInst)) { + if (BinOp->getOpcode() == Instruction::Add) { + continue; + } + } + NewInsts.push_back(newInst); + ValueMap[&*it] = newInst; + } + + updateOperands(NewInsts, ValueMap); + + for (Instruction *newInst : NewInsts) { + renameInstruction(newInst); + 
newInst->insertBefore(InsertBefore); + setAddInForBody(newInst, Add, InsertBefore); + } +} + +// Helper function to preprocess the cloned for body +static void preProcessClonedForBody(BasicBlock *ClonedForBody, Value *sub) { + Instruction *addInst = nullptr; + for (Instruction &I : *ClonedForBody) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Add) { + BinOp->setOperand(1, ConstantInt::get(BinOp->getType(), 8)); + addInst = BinOp; + } + } + if (auto *icmp = dyn_cast(&I)) { + icmp->setPredicate(CmpInst::Predicate::ICMP_SLT); + icmp->setOperand(0, addInst); + icmp->setOperand(1, sub); + icmp->setName("cmp11"); + } + } + LLVM_DEBUG(ClonedForBody->dump()); +} + +// Helper function to modify getelementptr instructions +static void modifyGetElementPtr(BasicBlock *BB) { + SmallVector gepInsts; + Value *firstGEPOperand0 = nullptr; + Value *secondGEPOperand1 = nullptr; + + for (Instruction &I : *BB) { + if (auto *GEP = dyn_cast(&I)) { + gepInsts.push_back(GEP); + } + } + + if (gepInsts.size() < 8 || gepInsts.size() % 2 != 0) { + return; + } + + firstGEPOperand0 = gepInsts[0]; + secondGEPOperand1 = gepInsts[1]; + + for (size_t i = 2; i < gepInsts.size(); ++i) { + if (i % 2 == 0) { + if (i < gepInsts.size() - 2) { + gepInsts[i]->setOperand(0, firstGEPOperand0); + } + } else { + gepInsts[i]->setOperand(0, secondGEPOperand1); + } + + if (i == 14) + continue; + + Instruction *operand1 = dyn_cast(gepInsts[i]->getOperand(1)); + gepInsts[i]->setOperand( + 1, ConstantInt::get(Type::getInt32Ty(BB->getContext()), i / 2)); + if (operand1 && operand1->use_empty()) { + operand1->eraseFromParent(); + } + } +} + +// Helper function to check if a PHI node has an incoming value of zero +static bool isIncomingValueZeroOfPhi(PHINode *phi) { + return phi->getType()->isIntegerTy(32) && + isa(phi->getIncomingValue(0)) && + cast(phi->getIncomingValue(0))->isZero(); +} + +// Helper function to find and set add instructions +static std::pair +findAndSetAddInstructions(BasicBlock *ClonedForBody) { + Instruction *FirstAdd = nullptr; + Instruction *SecondAdd = nullptr; + + for (Instruction &I : *ClonedForBody) { + if (BinaryOperator *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Add) { + if (!FirstAdd) { + FirstAdd = &I; + FirstAdd->setHasNoSignedWrap(true); + } else if (!SecondAdd) { + SecondAdd = &I; + break; + } + } + } + } + assert(FirstAdd && SecondAdd && "Failed to find matching add instructions"); + return std::make_pair(FirstAdd, SecondAdd); +} + +// Helper functions for PHI node manipulation + +static PHINode *findZeroInitializedPHI(BasicBlock *block) { + for (Instruction &I : *block) { + if (PHINode *phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(phi)) { + return phi; + } + } + } + return nullptr; +} + +static PHINode *findIntegerPHI(BasicBlock *block) { + for (Instruction &I : *block) { + if (PHINode *phi = dyn_cast(&I)) { + if (phi->getType()->isIntegerTy(32) && !isIncomingValueZeroOfPhi(phi)) { + return phi; + } + } + } + return nullptr; +} + +// Helper function to unroll loop body +static void unrollLoopBody(BasicBlock *block, PHINode *thirdPHI, + Instruction *callInst, Instruction *addInst, + PHINode *zeroInitializedPHI, LLVMContext &context) { + for (int i = 1; i < 8; i++) { + Instruction *add = BinaryOperator::CreateDisjoint( + Instruction::Or, zeroInitializedPHI, + ConstantInt::get(Type::getInt32Ty(context), i), "add" + Twine(i), + block); + copyAndRemapInstructions(thirdPHI, callInst->getNextNode(), addInst, add); + } +} + +// Helper function to update 
add instruction +static void updateAddInstruction(Instruction *addInst, PHINode *integerPHI, + LLVMContext &context) { + if (addInst) { + addInst->setOperand(1, ConstantInt::get(Type::getInt32Ty(context), 8)); + addInst->setOperand(0, integerPHI); + } +} + +// Helper function to update block terminator +static void updateBlockTerminator(BasicBlock *block, BasicBlock *successor) { + Instruction *terminator = block->getTerminator(); + terminator->setSuccessor(0, block); + terminator->setSuccessor(1, successor); +} + +// Helper function to modify getelementptr for unrolling +static void modifyGetElementPtrForUnrolling(BasicBlock *block) { + SmallVector gepInsts; + for (Instruction &I : *block) { + if (auto *GEP = dyn_cast(&I)) { + gepInsts.push_back(GEP); + } + } + + for (size_t i = 2; i < gepInsts.size(); i += 2) { + gepInsts[i]->setOperand(0, gepInsts[0]); + gepInsts[i]->setOperand( + 1, ConstantInt::get(Type::getInt32Ty(block->getContext()), i / 2)); + } +} + +// Helper function to handle add instructions +static void handleAddInstructions(BasicBlock *block, unsigned int unrollFactor, + PHINode *zeroInitializedPHI, + LLVMContext &context) { + auto [firstAdd, secondAdd] = findAndSetAddInstructions(block); + + if (firstAdd && secondAdd) { + firstAdd->moveBefore(secondAdd); + + if (unrollFactor == 1) { + firstAdd->setOperand(1, ConstantInt::get(Type::getInt32Ty(context), 8)); + secondAdd->setOperand(0, zeroInitializedPHI); + } + } +} + +// Function to unroll the cloned for loop body +static void unrollClonedForBody(BasicBlock *clonedForBody, + BasicBlock *forCondPreheader, + unsigned int unrollFactor = 0) { + Function *function = clonedForBody->getParent(); + LLVMContext &context = function->getContext(); + + // Find key instructions in the cloned for body + auto [thirdPHI, callInst, addInst] = findKeyInstructions(clonedForBody); + PHINode *zeroInitializedPHI = findZeroInitializedPHI(clonedForBody); + PHINode *integerPHI = findIntegerPHI(clonedForBody); + + assert(zeroInitializedPHI && "No matching zero-initialized PHI node found"); + + // Unroll the loop body if key instructions are found + if (thirdPHI && callInst) { + unrollLoopBody(clonedForBody, thirdPHI, callInst, addInst, + zeroInitializedPHI, context); + } + + // Update the add instruction + updateAddInstruction(addInst, integerPHI, context); + + // Update the basic block terminator + updateBlockTerminator(clonedForBody, forCondPreheader); + + // Move PHI nodes to the top of the basic block + movePHINodesToTop(*clonedForBody); + + // Modify getelementptr instructions based on the unroll factor + if (unrollFactor == 0) { + modifyGetElementPtr(clonedForBody); + } else { + modifyGetElementPtrForUnrolling(clonedForBody); + } + + // Handle add instructions + handleAddInstructions(clonedForBody, unrollFactor, zeroInitializedPHI, + context); +} + +// Function to check if a call instruction can be moved +static bool canMoveCallInstruction(CallInst *callInst, + Instruction *insertPoint) { + for (unsigned i = 0; i < callInst->getNumOperands(); ++i) { + if (auto *operandInst = dyn_cast(callInst->getOperand(i))) { + if (operandInst->getParent() == callInst->getParent() && + insertPoint->comesBefore(operandInst)) { + return false; + } + } + } + return true; +} + +// Function to group and reorder instructions in a basic block +static void groupAndReorderInstructions(BasicBlock *clonedForBody) { + // Collect different types of instructions + SmallVector phiNodes; + SmallVector orInsts, gepInsts, loadInsts, storeInsts, mulInsts, + addInsts, subInsts, 
callInsts, ashrInsts, faddInsts, fmulInsts, fsubInsts; + + // Categorize instructions by type + for (Instruction &I : *clonedForBody) { + if (auto *phi = dyn_cast(&I)) { + phiNodes.push_back(phi); + } else if (I.getOpcode() == Instruction::Or) { + orInsts.push_back(&I); + } else if (isa(&I)) { + gepInsts.push_back(&I); + } else if (isa(&I)) { + loadInsts.push_back(&I); + } else if (isa(&I)) { + storeInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::Mul) { + mulInsts.push_back(&I); + } else if (isa(&I)) { + callInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::Add) { + addInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::Sub) { + subInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::FAdd) { + faddInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::FMul) { + fmulInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::FSub) { + fsubInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::AShr) { + return; + } + } + + // If no PHI nodes are found, return + if (phiNodes.empty()) { + return; + } + + // Reorder instructions + Instruction *insertPoint = phiNodes.back()->getNextNode(); + bool canMoveCallInst = + callInsts.empty() || + canMoveCallInstruction(dyn_cast(callInsts[0]), insertPoint); + + auto moveInstructions = [&insertPoint](SmallVector &insts) { + for (auto *inst : insts) { + inst->moveBefore(insertPoint); + insertPoint = inst->getNextNode(); + } + }; + + // Move instructions in the desired order + moveInstructions(mulInsts); + moveInstructions(addInsts); + moveInstructions(orInsts); + moveInstructions(subInsts); + moveInstructions(gepInsts); + moveInstructions(loadInsts); + moveInstructions(faddInsts); + moveInstructions(fmulInsts); + moveInstructions(fsubInsts); + if (canMoveCallInst) { + moveInstructions(callInsts); + } +} + +// Function to transform a single loop depth (currently suitable for +// dotprod/dotprode example) +static bool transformOneLoopDepth(Function &F) { + LLVMContext &ctx = F.getContext(); + bool changed = false; + + // Get necessary basic blocks and values + Value *len = getLenFromEntryBlock(F); + BasicBlock *entryBB = &F.getEntryBlock(); + BasicBlock *forBodyBB = getBasicBlockByName(F, "for.body"); + BasicBlock *forBodyNewBB = getBasicBlockByName(F, "for.body.clone"); + BasicBlock *ifEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *forCond46PreheaderBB = + getBasicBlockByName(F, "for.cond.preheader"); + + assert(forBodyBB && "Expected to find for.body!"); + assert(forBodyNewBB && "Expected to find for.body.clone!"); + assert(ifEnd && "Expected to find if.end!"); + assert(forCond46PreheaderBB && "Expected to find for.cond.preheader!"); + + // Create new basic blocks + BasicBlock *forCondPreheaderBB = + BasicBlock::Create(F.getContext(), "for.cond.preheader", &F, forBodyBB); + BasicBlock *forBodyPreheaderBB = + BasicBlock::Create(F.getContext(), "for.body.preheader", &F, forBodyBB); + BasicBlock *forCond31PreheaderBB = + BasicBlock::Create(F.getContext(), "for.cond31.preheader", &F, forBodyBB); + BasicBlock *forBody33BB = cloneBasicBlockWithRelations(forBodyBB, "33", &F); + forBody33BB->setName("for.body33"); + forBody33BB->moveAfter(forBodyBB); + BasicBlock *forEnd37BB = + BasicBlock::Create(F.getContext(), "for.end37", &F, forBodyNewBB); + + // Add instructions to forCondPreheaderBB + IRBuilder<> builder(forCondPreheaderBB); + Value *negativeSeven = ConstantInt::get(Type::getInt32Ty(F.getContext()), -7); + Value *sub = builder.CreateNSWAdd(len, negativeSeven, "sub"); + Value 
*seven = ConstantInt::get(Type::getInt32Ty(F.getContext()), 7); + Value *cmp1113 = builder.CreateICmpUGT(len, seven, "cmp1113"); + builder.CreateCondBr(cmp1113, forBodyPreheaderBB, forCond31PreheaderBB); + + // Add instructions to forBodyPreheaderBB + builder.SetInsertPoint(forBodyPreheaderBB); + Value *mask = ConstantInt::get(Type::getInt32Ty(F.getContext()), 2147483640); + Value *andValue = builder.CreateAnd(len, mask, ""); + builder.CreateBr(forBodyBB); + + // Modify for.body + PHINode *iPhi = dyn_cast(&forBodyBB->front()); + iPhi->setName("i.0122"); + + // copy first float phinode from forBodyBB to forCond31PreheaderBB + PHINode *firstFloatPhi = getFirstFloatPhi(forBodyBB); + PHINode *acc00Lcssa = PHINode::Create(firstFloatPhi->getType(), 2, + "acc0.0.lcssa", forCond31PreheaderBB); + acc00Lcssa->addIncoming(firstFloatPhi->getIncomingValue(0), + firstFloatPhi->getIncomingBlock(0)); + acc00Lcssa->addIncoming(firstFloatPhi->getIncomingValue(1), + forCondPreheaderBB); + // Unroll and duplicate loop iterations + SmallVector instructions; + for (int i = 0; i < 7; i++) { + Instruction *copyedPhiNode = + unrollAndDuplicateLoopIteration(ctx, forBodyBB, builder, i + 1); + if (PHINode *phi = dyn_cast(copyedPhiNode)) { + phi->setName("acc" + Twine(i + 1) + ".0.lcssa"); + phi->setIncomingBlock(1, forCondPreheaderBB); + phi->insertInto(forCond31PreheaderBB, forCond31PreheaderBB->end()); + instructions.push_back(phi); + } + } + + // Update for.body terminator + Instruction *incInst = nullptr; + MDNode *loopMD = nullptr; + for (auto &I : *forBodyBB) { + if (I.getOpcode() == Instruction::Add) { + incInst = &I; + Instruction *icmp = I.getNextNode(); + Instruction *br = icmp->getNextNode(); + assert(icmp->getOpcode() == Instruction::ICmp && + br->getOpcode() == Instruction::Br && + "Unexpected instruction sequence"); + I.moveAfter(&forBodyBB->back()); + loopMD = br->getMetadata(LLVMContext::MD_loop); + br->eraseFromParent(); + icmp->eraseFromParent(); + break; + } + } + + // Modify add instruction + incInst->setOperand(1, ConstantInt::get(Type::getInt32Ty(F.getContext()), 8)); + incInst->setName("add30"); + + builder.SetInsertPoint(forBodyBB); + Value *cmp1 = builder.CreateICmpSLT(incInst, sub, "cmp1"); + BranchInst *newBr = + builder.CreateCondBr(cmp1, forBodyBB, forCond31PreheaderBB); + newBr->setMetadata(LLVMContext::MD_loop, loopMD); + + movePHINodesToTop(*forBodyBB, forBodyPreheaderBB); + + // Add instructions to forCond31PreheaderBB + builder.SetInsertPoint(forCond31PreheaderBB); + PHINode *i0Lcssa = + builder.CreatePHI(Type::getInt32Ty(F.getContext()), 0, "i.0.lcssa"); + i0Lcssa->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + forCondPreheaderBB); + i0Lcssa->addIncoming(andValue, forBodyBB); + Value *cmp32132 = builder.CreateICmpSLT(i0Lcssa, len, "cmp32132"); + builder.CreateCondBr(cmp32132, forBody33BB, forEnd37BB); + + // Modify forBody33BB + Instruction *tempInstr = nullptr; + for (auto &I : *forBody33BB) { + if (PHINode *phi = dyn_cast(&I)) { + if (phi->getType()->isIntegerTy(32)) { + phi->setIncomingValue(1, i0Lcssa); + phi->setIncomingBlock(1, forCond31PreheaderBB); + } else if (phi->getType()->isFloatTy()) { + phi->setIncomingValue(1, acc00Lcssa); + phi->setIncomingBlock(1, forCond31PreheaderBB); + tempInstr = phi; + } + } + } + + // Modify forEnd37BB + Instruction *acc01Lcssa = tempInstr->clone(); + acc01Lcssa->setName("acc0.1.lcssa"); + acc01Lcssa->insertInto(forEnd37BB, forEnd37BB->end()); + builder.SetInsertPoint(forEnd37BB); + + // Create pairs of floating-point 
additions + Value *sum01 = builder.CreateFAdd(acc01Lcssa, instructions[0], "sum01"); + Value *sum23 = builder.CreateFAdd(instructions[1], instructions[2], "sum23"); + Value *sum45 = builder.CreateFAdd(instructions[3], instructions[4], "sum45"); + Value *sum67 = builder.CreateFAdd(instructions[5], instructions[6], "sum67"); + + // Combine pairs + Value *sum0123 = builder.CreateFAdd(sum01, sum23, "sum0123"); + Value *sum4567 = builder.CreateFAdd(sum45, sum67, "sum4567"); + + // Final addition + Value *currentAdd = builder.CreateFAdd(sum0123, sum4567, "add44"); + builder.CreateBr(ifEnd); + + // Modify entry basic block + BranchInst *entryBi = dyn_cast(entryBB->getTerminator()); + entryBi->setSuccessor(0, forCondPreheaderBB); + entryBi->setSuccessor(1, forCond46PreheaderBB); + + // Modify forCond46PreheaderBB + forCond46PreheaderBB->getTerminator()->getPrevNode()->setName("cmp47110"); + + // Modify for.body33 + BranchInst *forBody33Bi = dyn_cast(forBody33BB->getTerminator()); + forBody33Bi->setSuccessor(0, forEnd37BB); + forBody33Bi->setSuccessor(1, forBody33BB); + + // Modify if.end + PHINode *ifEndPhi = dyn_cast(&ifEnd->front()); + ifEndPhi->setIncomingValue(1, currentAdd); + ifEndPhi->setIncomingBlock(1, forEnd37BB); + + changed = true; + return changed; +} + +// Function to unroll the cloned for.cond.preheader +static void unrollClonedForCondPreheader(BasicBlock *clonedForBody, + BasicBlock *clonedForCondPreheader, + BasicBlock *forCondPreheader) { + Function *F = clonedForBody->getParent(); + BasicBlock *forBody = getBasicBlockByName(*F, "for.body"); + assert(forBody && "Expected to find for.body!"); + + // Find PHI instructions in clonedForBody + SmallVector phiNodes; + for (Instruction &I : *clonedForBody) { + if (PHINode *phi = dyn_cast(&I)) { + phiNodes.push_back(phi); + } + } + + // Remove unused PHI nodes in clonedForCondPreheader + SmallVector unusedPhiNodes; + for (Instruction &I : *clonedForCondPreheader) { + if (PHINode *phi = dyn_cast(&I)) { + if (phi->use_empty()) { + unusedPhiNodes.push_back(phi); + } + } + } + for (PHINode *phi : unusedPhiNodes) { + phi->eraseFromParent(); + } + + // Clone PHI instructions to the beginning of clonedForCondPreheader + Instruction *insertPoint = &clonedForCondPreheader->front(); + SmallVector clonedPhiNodes; + for (PHINode *phi : phiNodes) { + PHINode *clonedPhi = cast(phi->clone()); + clonedPhi->setName(phi->getName() + ".clone"); + clonedPhi->setIncomingBlock(0, forBody); + clonedPhi->insertBefore(insertPoint); + insertPoint = clonedPhi->getNextNode(); + clonedPhiNodes.push_back(clonedPhi); + } + + // Find and clone the unique icmp instruction in forBody + Value *specStoreSelect = nullptr; + Instruction *cmpSlt = nullptr; + for (Instruction &I : *forBody) { + if (auto *icmp = dyn_cast(&I)) { + specStoreSelect = icmp->getOperand(0); + cmpSlt = icmp->clone(); + cmpSlt->setName("cmp_slt"); + cmpSlt->insertAfter(insertPoint); + break; + } + } + assert(specStoreSelect && "Failed to find icmp instruction in ForBody"); + + // Replace the existing icmp in clonedForCondPreheader + for (Instruction &I : *clonedForCondPreheader) { + if (auto *icmp = dyn_cast(&I)) { + icmp->replaceAllUsesWith(cmpSlt); + icmp->eraseFromParent(); + break; + } + } + + // Set the operand of cmp_slt to the first cloned PHI node + cmpSlt->setOperand(0, clonedPhiNodes[0]); + + // Update the successor of clonedForCondPreheader + clonedForCondPreheader->getTerminator()->setSuccessor(1, forCondPreheader); +} + +static std::tuple +modifyForBodyPreheader(BasicBlock 
*ForBodyPreheader, + BasicBlock *ClonedForCondPreheader) { + PHINode *TargetPHI = nullptr; + PHINode *TargetPHI2 = nullptr; + PHINode *TargetPHI3 = nullptr; + for (Instruction &I : *ClonedForCondPreheader) { + if (auto *phi = dyn_cast(&I)) { + if (phi->getType()->isIntegerTy(32)) { + if (isIncomingValueZeroOfPhi(phi)) { + // Found the target PHI node + TargetPHI = phi; + } else { + TargetPHI2 = phi; + } + } else if (phi->getType()->isFloatTy()) { + if (TargetPHI3 == nullptr) { + TargetPHI3 = phi; + break; + } + } + } + } + BinaryOperator *NewSub = nullptr; + for (Instruction &I : *ForBodyPreheader) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Sub) { + // Change to add + NewSub = BinaryOperator::CreateAdd(BinOp->getOperand(0), TargetPHI, + BinOp->getName(), BinOp); + BinOp->replaceAllUsesWith(NewSub); + BinOp->eraseFromParent(); + break; + } + } + } + + ForBodyPreheader->moveAfter(ClonedForCondPreheader); + assert(NewSub && "NewSub should not be nullptr"); + return std::make_tuple(NewSub, TargetPHI2, TargetPHI3); +} + +static Value *expandForCondPreheader( + BasicBlock *ForBody, BasicBlock *ForCondPreheader, + BasicBlock *ClonedForCondPreheader, + std::tuple NewSubAndTargetPHI3) { + Instruction *TargetInst = + getFirstCallInstWithName(ForBody, "llvm.fmuladd.f32"); + assert(TargetInst && "TargetInst not found"); + Value *NewSub = std::get<0>(NewSubAndTargetPHI3); + Value *TargetPHI2 = std::get<1>(NewSubAndTargetPHI3); + Value *TargetPHI3 = std::get<2>(NewSubAndTargetPHI3); + // Create new .loopexit basic block + BasicBlock *LoopExit = BasicBlock::Create( + ForCondPreheader->getContext(), ForCondPreheader->getName() + ".loopexit", + ForCondPreheader->getParent(), ForCondPreheader); + + // Create new sub instruction in .loopexit block + IRBuilder<> Builder(LoopExit); + Value *NewSubInst = Builder.CreateSub(NewSub, TargetPHI2); + + // Add unconditional branch to ForCondPreheader + Builder.CreateBr(ForCondPreheader); + + // Find the target PHI node in ClonedForCondPreheader + PHINode *TargetPHI = nullptr; + for (PHINode &Phi : ClonedForCondPreheader->phis()) { + if (isIncomingValueZeroOfPhi(&Phi)) { + TargetPHI = Φ + break; + } + } + + // Ensure we found the target PHI node + assert(TargetPHI && + "Failed to find target PHI node in ClonedForCondPreheader"); + + // Update the incoming value of the PHI nodes in ForCondPreheader to the + // result of the new sub instruction + for (PHINode &Phi : ForCondPreheader->phis()) { + if (Phi.getType()->isIntegerTy(32)) { + Phi.setIncomingValue(0, TargetPHI); + Phi.setIncomingBlock(0, ClonedForCondPreheader); + Phi.setIncomingValue(1, NewSubInst); + Phi.setIncomingBlock(1, LoopExit); + } else if (Phi.getType()->isFloatTy()) { + Phi.setIncomingValue(0, TargetPHI3); + Phi.setIncomingBlock(0, ClonedForCondPreheader); + // Phi.setIncomingValue(1, TargetInst); + Phi.setIncomingBlock(1, LoopExit); + } + } + + // Get the icmp instruction in ForCondPreheader + ICmpInst *icmpInst = getFirstICmpInst(ForCondPreheader); + + // Ensure we found the icmp instruction + assert(icmpInst && "Failed to find icmp instruction in ForCondPreheader"); + + // Set the operand 1 of icmpInst to constant 7 + LLVMContext &Ctx = ForCondPreheader->getContext(); + Value *const7 = ConstantInt::get(Type::getInt32Ty(Ctx), 7); + icmpInst->setOperand(1, const7); + + // Create a new add nsw instruction before icmpInst, with operand 0 the same + // as icmpInst, and operand 1 as -7. 
This instruction will be used as the + // return value of the function + Value *constNeg7 = ConstantInt::get(Type::getInt32Ty(Ctx), -7); + IRBuilder<> BuilderBeforeICmp(icmpInst); + Value *AddInst = + BuilderBeforeICmp.CreateNSWAdd(icmpInst->getOperand(0), constNeg7); + + ForBody->getTerminator()->setSuccessor(0, LoopExit); + + return AddInst; +} + +static void updateRealForBody(Function &F, Value *sub) { + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + assert(ForBody && "Expected to find for.body!"); + ICmpInst *lastICmp = + getLastICmpInstWithPredicate(ForBody, ICmpInst::ICMP_SLT); + if (lastICmp) { + lastICmp->setOperand(1, sub); + } +} + +static void modifyForBody(BasicBlock *ClonedForCondPreheader, + BasicBlock *ForBody) { + // Find the unique float type PHI node in ForBody + PHINode *FloatPhiInForBody = getFirstFloatPhi(ForBody); + assert(FloatPhiInForBody && "Failed to find float type PHI node in ForBody"); + // Find the first float type PHI node in ClonedForCondPreheader + PHINode *FirstFloatPhiInClonedForCondPreheader = + getFirstFloatPhi(ClonedForCondPreheader); + assert(FloatPhiInForBody && "Failed to find float type PHI node in ForBody"); + // Set the incoming value of the float type PHI node in ForBody to the float + // type PHI node in ClonedForCondPreheader + FloatPhiInForBody->setIncomingValue(0, FirstFloatPhiInClonedForCondPreheader); + + // Find the unique icmp eq instruction in ForBody + ICmpInst *IcmpEq = getFirstICmpInstWithPredicate(ForBody, ICmpInst::ICMP_EQ); + + // Ensure we found the icmp eq instruction + assert(IcmpEq && "Failed to find icmp eq instruction in ForBody"); + + // Get the original operand 1 + Value *OriginalOperand1 = IcmpEq->getOperand(1); + + // Ensure the original operand 1 is an instruction + if (Instruction *OriginalOperand1Inst = + dyn_cast(OriginalOperand1)) { + // Set operand 1 to the operand 0 of the original operand 1 instruction + IcmpEq->setOperand(1, OriginalOperand1Inst->getOperand(0)); + } else { + assert(false && "The original operand 1 is not an instruction, " + "cannot get its operand 0\n"); + } + + // Find the phi i32 incoming value that is a variable in + // ClonedForCondPreheader + PHINode *TargetPHI = nullptr; + PHINode *TargetPHI2 = nullptr; + for (Instruction &I : *ClonedForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(Phi)) { + TargetPHI = Phi; + } else { + TargetPHI2 = Phi; + } + if (TargetPHI && TargetPHI2) + break; + } + } + + // Ensure we found the target PHI node + assert(TargetPHI && + "Failed to find the target PHI node in ClonedForCondPreheader"); + + // Find the phi i32 incoming value that is a variable in ForBody + PHINode *TargetPHIInForBody = nullptr; + PHINode *TargetPHIInForBody2 = nullptr; + for (Instruction &I : *ForBody) { + if (PHINode *Phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(Phi)) { + TargetPHIInForBody = Phi; + } else { + TargetPHIInForBody2 = Phi; + } + if (TargetPHIInForBody && TargetPHIInForBody2) + break; + } + } + + // Ensure that the target PHI nodes are found + assert(TargetPHIInForBody && TargetPHIInForBody2 && + "Failed to find matching PHI nodes in ForBody"); + + // Set the incoming value of the PHI nodes found in ForBody + // to the PHI nodes found in ClonedForCondPreheader + TargetPHIInForBody->setIncomingValue(0, TargetPHI); + TargetPHIInForBody2->setIncomingValue(0, TargetPHI2); + + IcmpEq->setOperand(0, TargetPHIInForBody2->getIncomingValue(1)); +} + +static void insertUnusedInstructionsBeforeIcmp(PHINode 
*phiI32InClonedForBody, + ICmpInst *lastIcmpEq) { + for (Use &U : phiI32InClonedForBody->uses()) { + if (Instruction *Used = dyn_cast(U.getUser())) { + if (Used->getParent() == nullptr) { + if (Used->use_empty()) { + Used->insertBefore(lastIcmpEq); + } + } + } + } +} + +static void modifyClonedForBody(BasicBlock *ClonedForBody) { + + ICmpInst *lastIcmpEq = getLastICmpInst(ClonedForBody); + assert(lastIcmpEq && + "Failed to find last icmp eq instruction in ClonedForBody"); + + PHINode *phiI32InClonedForBody = nullptr; + for (auto &Inst : *ClonedForBody) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (isIncomingValueZeroOfPhi(Phi)) { + phiI32InClonedForBody = Phi; + insertUnusedInstructionsBeforeIcmp(phiI32InClonedForBody, lastIcmpEq); + } + } + } + + // Ensure that the phi i32 node is found + assert(phiI32InClonedForBody && "phi i32 node not found in ClonedForBody"); +} + +static BasicBlock *getFirstSuccessorOfForBody(BasicBlock *ForBody) { + BasicBlock *ForCondPreheader = nullptr; + assert(succ_size(ForBody) == 2 && "ForBody should have 2 successors"); + for (auto *succ : successors(ForBody)) { + ForCondPreheader = succ; + break; + } + return ForCondPreheader; +} + +static std::tuple +cloneThreeBB(BasicBlock *ForBodyPreheader, BasicBlock *ForBody, + BasicBlock *ForCondPreheader, Function &F) { + ValueToValueMapTy VMap; + SmallVector NewBlocks; + + BasicBlock *ClonedForBodyPreheader = + CloneBasicBlock(ForBodyPreheader, VMap, ".modify", &F); + BasicBlock *ClonedForBody = CloneBasicBlock(ForBody, VMap, ".modify", &F); + BasicBlock *ClonedForCondPreheader = + CloneBasicBlock(ForCondPreheader, VMap, ".modify", &F); + + VMap[ForBodyPreheader] = ClonedForBodyPreheader; + VMap[ForBody] = ClonedForBody; + VMap[ForCondPreheader] = ClonedForCondPreheader; + + // Remap instructions and PHI nodes in the new loop + remapInstructionsInBlocks( + {ClonedForBodyPreheader, ClonedForBody, ClonedForCondPreheader}, VMap); + return std::make_tuple(ClonedForBodyPreheader, ClonedForBody, + ClonedForCondPreheader); +} + +static std::tuple +modifyFirstForBody(Loop *L, Function &F, BasicBlock *ForBody, Value *sub) { + + BasicBlock *ForBodyPreheader = L->getLoopPreheader(); + + // Find the predecessor of ForBodyPreheader + BasicBlock *PreForBody = nullptr; + assert(pred_size(ForBodyPreheader) == 1 && + "ForBodyPreheader should have only one predecessor"); + for (auto *Pred : predecessors(ForBodyPreheader)) { + PreForBody = Pred; + } + + // Find the first successor of ForBody, it should have two + BasicBlock *ForCondPreheader = getFirstSuccessorOfForBody(ForBody); + + std::tuple ClonedBBs = + cloneThreeBB(ForBodyPreheader, ForBody, ForCondPreheader, F); + BasicBlock *ClonedForBodyPreheader = std::get<0>(ClonedBBs); + BasicBlock *ClonedForBody = std::get<1>(ClonedBBs); + BasicBlock *ClonedForCondPreheader = std::get<2>(ClonedBBs); + + /* insert 2 cloned blocks between PreForBody and ForBody */ + // for.body -> for.body12.lr.ph + PreForBody->getTerminator()->setSuccessor(0, ClonedForBodyPreheader); + ClonedForBodyPreheader->moveAfter(PreForBody); + // for.body12.lr.ph -> for.body12 + ClonedForBodyPreheader->getTerminator()->setSuccessor(0, ClonedForBody); + + // for.body12 -> for.cond59.preheader + ClonedForBody->moveAfter(ClonedForBodyPreheader); + + // for.cond59.preheader -> for.body62.lr.ph + ClonedForCondPreheader->getTerminator()->setSuccessor(0, ForBodyPreheader); + + // for.cond59.preheader -> for.cond71.preheader + ClonedForCondPreheader->getTerminator()->setSuccessor(1, + ClonedForCondPreheader); + 
ClonedForCondPreheader->moveAfter(ClonedForBodyPreheader); + // for.body -> for.cond71.preheader + PreForBody->getTerminator()->setSuccessor(1, ClonedForCondPreheader); + + preProcessClonedForBody(ClonedForBody, sub); + updateRealForBody(F, sub); + unrollClonedForBody(ClonedForBody, ClonedForCondPreheader, 0); + modifyClonedForBody(ClonedForBody); + unrollClonedForCondPreheader(ClonedForBody, ClonedForCondPreheader, + ForCondPreheader); + + modifyForBody(ClonedForCondPreheader, ForBody); + std::tuple NewSubAndTargetPHI3 = + modifyForBodyPreheader(ForBodyPreheader, ClonedForCondPreheader); + + Value *AddInst = expandForCondPreheader( + ForBody, ForCondPreheader, ClonedForCondPreheader, NewSubAndTargetPHI3); + + ClonedForBodyPreheader->moveBefore(ClonedForBody); + groupAndReorderInstructions(ClonedForBody); + return std::make_tuple(ClonedForCondPreheader, ForCondPreheader, AddInst); +} + +static bool moveIfEndToEnd(Function &F) { + + BasicBlock &lastBB = F.back(); + if (lastBB.getName() == "if.end") { + return false; + } + + BasicBlock *ifEndBB = getBasicBlockByName(F, "if.end"); + assert(ifEndBB && "Expected to find if.end!"); + if (ifEndBB) { + ifEndBB->removeFromParent(); + ifEndBB->insertInto(&F); + } + return true; +} + +static Value *modifyForCondPreheader(Function &F) { + LLVMContext &Ctx = F.getContext(); + + BasicBlock *forCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *forBodyLrPh = getBasicBlockByName(F, "for.body.lr.ph"); + assert(forCondPreheader && "Expected to find for.cond.preheader!"); + assert(forBodyLrPh && "Expected to find for.body.lr.ph!"); + forCondPreheader->replaceAllUsesWith(forBodyLrPh); + forCondPreheader->eraseFromParent(); + forBodyLrPh->setName("for.cond.preheader"); + + unsigned int loadnum = 0; + for (auto I = forBodyLrPh->begin(); I != forBodyLrPh->end(); ++I) { + if (auto *loadinst = dyn_cast(&*I)) { + loadnum++; + if (loadnum == 2) { + IRBuilder<> Builder(loadinst->getNextNode()); + Value *NegSeven = ConstantInt::get(Type::getInt32Ty(Ctx), -7); + Value *Sub = Builder.CreateNSWAdd(loadinst, NegSeven, "sub"); + return Sub; // Return the newly inserted instruction + } + } + } + assert(false && "it must not be here"); +} + +static void modifyForCondPreheader2(BasicBlock *ClonedForBody, + BasicBlock *ClonedForCondPreheader, + BasicBlock *ForCondPreheader, + Value *andinst) { + + // Find phi instructions of float type in ClonedForBody + SmallVector PhiNodes; + for (Instruction &I : *ClonedForBody) { + if (PHINode *Phi = dyn_cast(&I)) { + PhiNodes.push_back(Phi); + } + } + + // Clone the found phi instructions to the beginning of ClonedForCondPreheader + // in order + Instruction *InsertPoint = &ForCondPreheader->front(); + PHINode *phi = cast(InsertPoint); + + BasicBlock *lastForCondPreheader = phi->getIncomingBlock(0); + SmallVector ClonedPhiNodes; + unsigned int floatphicount = 0; + for (PHINode *Phi : PhiNodes) { + PHINode *ClonedPhi = cast(Phi->clone()); + ClonedPhi->setName(Phi->getName() + ".clone"); + // Modify the operand 0 basicblock of each phi instruction to ForBody + if (Phi->getType()->isFloatTy()) { + if (floatphicount == 0) { + ClonedPhi->setIncomingValue(0, phi->getIncomingValue(0)); + floatphicount++; + } + } + ClonedPhi->setIncomingBlock(0, lastForCondPreheader); + ClonedPhi->insertAfter(InsertPoint); + // Update the insertion point to after the newly inserted PHI node + InsertPoint = ClonedPhi; + + ClonedPhiNodes.push_back(ClonedPhi); + } + + // Find operand 1 of the icmp instruction from ClonedForBody + ICmpInst 
*firstIcmp = getFirstICmpInst(ClonedForBody); + assert(firstIcmp && "Unable to find icmp instruction in ClonedForBody"); + Value *IcmpOperand1 = firstIcmp->getOperand(1); + + // Set operand 0 of icmp in ForCondPreheader to ClonedPhiNodes[0], and operand + // 1 to IcmpOperand1 + for (Instruction &I : *ForCondPreheader) { + if (ICmpInst *Icmp = dyn_cast(&I)) { + Icmp->setOperand(0, ClonedPhiNodes[0]); + Icmp->setOperand(1, IcmpOperand1); + Icmp->setName("cmp"); + break; + } + } + + ForCondPreheader->getTerminator()->setSuccessor(1, ClonedForCondPreheader); + + // // Delete redundant getelementptr, store and add instructions + SmallVector InstructionsToRemove; + for (Instruction &I : *ForCondPreheader) { + if (isa(&I) || isa(&I) || + isa(&I)) { + InstructionsToRemove.push_back(&I); + } + } + for (auto Inst = InstructionsToRemove.rbegin(); + Inst != InstructionsToRemove.rend(); ++Inst) { + if ((*Inst)->use_empty()) { + (*Inst)->eraseFromParent(); + } + } + // Find the icmp instruction in ClonedForCondPreheader + ICmpInst *IcmpInForCondPreheader = + getFirstICmpInstWithPredicate(ForCondPreheader, ICmpInst::ICMP_EQ); + + // Ensure that the icmp instruction is found + assert(IcmpInForCondPreheader && + "icmp instruction not found in ClonedForCondPreheader"); + + // Get the original operand 1 + Value *OriginalOperand1 = IcmpInForCondPreheader->getOperand(1); + + // If the original operand 1 is an instruction, get its operand 0 + if (Instruction *OriginalOperand1Inst = + dyn_cast(OriginalOperand1)) { + Value *NewOperand1 = OriginalOperand1Inst->getOperand(0); + + // Set the new operand 1 + IcmpInForCondPreheader->setOperand(1, NewOperand1); + // Change the original eq to slt + + IcmpInForCondPreheader->setPredicate(CmpInst::ICMP_SLT); + + } else { + assert(false && "The original operand 1 is not an instruction, cannot get " + "its operand 0\n"); + } + + // Find phi i32 node in ForCondPreheader with incoming 0 value == 0 + PHINode *TargetPhi = nullptr; + for (Instruction &I : *ForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(Phi)) { + TargetPhi = Phi; + break; + } + } + } + + // Ensure the target phi node is found + assert(TargetPhi && "No matching phi i32 node found in ForCondPreheader"); + + TargetPhi->setIncomingValue(1, andinst); +} + +static Value *modifyClonedForBodyPreheader(BasicBlock *ClonedForBodyPreheader, + BasicBlock *ForBody) { + ICmpInst *firstIcmp = getFirstICmpInst(ForBody); + assert(firstIcmp && "Unable to find icmp instruction in ForBody"); + + Value *IcmpOperand1 = firstIcmp->getOperand(1); + + IRBuilder<> Builder(ClonedForBodyPreheader->getTerminator()); + Value *AndInst = + Builder.CreateAnd(IcmpOperand1, Builder.getInt32(2147483640)); + return AndInst; +} + +static void modifyClonedForCondPreheader(BasicBlock *ClonedForCondPreheader, + BasicBlock *ForBody, + BasicBlock *ForCondPreheader) { + + // Find float type phi node in ForBody + PHINode *FloatPhiInForBody = nullptr; + for (Instruction &I : *ForBody) { + if (PHINode *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + FloatPhiInForBody = cast(I.clone()); + break; + } + } + } + + // Find and replace float type phi node in ClonedForCondPreheader + if (FloatPhiInForBody) { + PHINode *phi = getFirstFloatPhi(ClonedForCondPreheader); + assert(phi && "phi node not found"); + FloatPhiInForBody->insertBefore(phi); + phi->replaceAllUsesWith(FloatPhiInForBody); + phi->eraseFromParent(); + } + + // Set incomingblock 0 of FloatPhiInForBody to ForCondPreheader + if (FloatPhiInForBody) { + 
FloatPhiInForBody->setIncomingBlock(0, ForCondPreheader); + } + + // Find float type phi nodes in ForCondPreheader + SmallVector FloatPhisInForCondPreheader; + for (Instruction &I : *ForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + FloatPhisInForCondPreheader.push_back(Phi); + } + } + } + + // Create 7 fadd instructions + Value *LastFAdd = nullptr; + if (FloatPhisInForCondPreheader.size() >= 8) { + IRBuilder<> Builder(FloatPhiInForBody->getNextNode()); + + Value *PrevAdd = getFirstFloatPhi(ClonedForCondPreheader); + + assert(PrevAdd && + "Unable to find float type PHI node in ClonedForCondPreheader"); + Value *Add139 = + Builder.CreateFAdd(PrevAdd, FloatPhisInForCondPreheader[2], "add139"); + Value *Add140 = + Builder.CreateFAdd(FloatPhisInForCondPreheader[3], + FloatPhisInForCondPreheader[4], "add140"); + Value *Add141 = + Builder.CreateFAdd(FloatPhisInForCondPreheader[5], + FloatPhisInForCondPreheader[6], "add141"); + Value *Add142 = + Builder.CreateFAdd(FloatPhisInForCondPreheader[7], + FloatPhisInForCondPreheader[8], "add142"); + Value *Add143 = Builder.CreateFAdd(Add139, Add140, "add143"); + Value *Add144 = Builder.CreateFAdd(Add141, Add142, "add144"); + Value *Add145 = Builder.CreateFAdd(Add143, Add144, "add145"); + LastFAdd = Add145; + } else { + llvm_unreachable("Unable to find float type PHI node in ForCondPreheader"); + } + + // Find store instruction in ForCondPreheader and update its operand + if (LastFAdd) { + for (auto &Inst : *ClonedForCondPreheader) { + if (auto *si = dyn_cast(&Inst)) { + si->setOperand(0, LastFAdd); + break; + } + } + } + + Value *addinst = nullptr; + // Iterate through instructions in ClonedForCondPreheader, looking for addnuw + // instruction + for (auto &Inst : *ClonedForCondPreheader) { + if (auto *AddInst = dyn_cast(&Inst)) { + if (AddInst->getOpcode() == Instruction::Add && + AddInst->hasNoUnsignedWrap()) { + addinst = AddInst; + break; + } + } + } + // Get the second successor of ClonedForCondPreheader + BasicBlock *SecondSuccessor = nullptr; + int SuccCount = 0; + for (auto *Succ : successors(ClonedForCondPreheader)) { + if (SuccCount == 1) { + SecondSuccessor = Succ; + break; + } + SuccCount++; + } + + if (SecondSuccessor && addinst) { + // Iterate through all PHI nodes in SecondSuccessor + int phiCount = 0; + for (PHINode &Phi : SecondSuccessor->phis()) { + if (phiCount == 1) { // Second phi node + // Set the second predecessor to ClonedForCondPreheader and its value to + // addinst + Phi.setIncomingBlock(1, ClonedForCondPreheader); + Phi.setIncomingValue(1, addinst); + } else { + // For other phi nodes, only update the predecessor basic block + Phi.setIncomingBlock(1, ClonedForCondPreheader); + } + phiCount++; + } + } +} + +static void modifyClonedForBody2(BasicBlock *ClonedForBody, + BasicBlock *ClonedForCondPreheader, + Value *AddInst, BasicBlock *ForCondPreheader) { + SmallVector floatPhiNodes; + + // Iterate through all instructions in ClonedForCondPreheader + for (Instruction &I : *ClonedForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + floatPhiNodes.push_back(Phi); + if (floatPhiNodes.size() == 8) { + break; // Stop after finding 8 float type PHI nodes + } + } + } + } + + // Ensure we found 8 float type PHI nodes + assert(floatPhiNodes.size() == 8 && + "Unable to find 8 float type PHI nodes in ClonedForCondPreheader"); + + // Now floatPhiNodes contains 8 float type PHI nodes in order + + // Iterate through all PHI nodes in ClonedForBody + int 
phiIndex = 0; + for (PHINode &Phi : ClonedForBody->phis()) { + if (Phi.getType()->isFloatTy()) { + // Ensure we don't access floatPhiNodes out of bounds + if (phiIndex < floatPhiNodes.size()) { + // Set the 0th incoming value of the PHI node to the corresponding node + // in floatPhiNodes + if (phiIndex > + 0) { // Don't set the first phi node, as it's floatPhiInForBody + Phi.setIncomingValue(0, floatPhiNodes[phiIndex]); + } + phiIndex++; + } else { + // If the number of float type PHI nodes in ClonedForBody exceeds the + // size of floatPhiNodes, output a warning + assert(false && "Warning: Number of float type PHI nodes in " + "ClonedForBody exceeds expectations\n"); + break; + } + } + } + + // Ensure we processed all expected PHI nodes + if (phiIndex < floatPhiNodes.size()) { + assert(false && "Warning: Number of float type PHI nodes in ClonedForBody " + "is less than expected\n"); + } + + // Find the last icmp eq instruction in ClonedForBody + ICmpInst *lastIcmpEq = + getLastICmpInstWithPredicate(ClonedForBody, ICmpInst::ICMP_EQ); + + // Ensure we found the icmp eq instruction + assert(lastIcmpEq && "Unable to find icmp eq instruction in ClonedForBody"); + + // Set operand 1 to addInst + lastIcmpEq->setOperand(1, AddInst); + // Change the predicate of the icmp eq instruction to slt (signed less than) + lastIcmpEq->setPredicate(ICmpInst::ICMP_SLT); + // Change the name to cmp + lastIcmpEq->setName("cmp"); + + ClonedForBody->getTerminator()->setSuccessor(1, ForCondPreheader); + + // Find phi i32 node in ClonedForBody + PHINode *phiI32InClonedForBody = nullptr; + for (auto &Inst : *ClonedForBody) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (Phi->getType()->isIntegerTy(32)) { + phiI32InClonedForBody = Phi; + insertUnusedInstructionsBeforeIcmp(phiI32InClonedForBody, lastIcmpEq); + } + } + } + + // Ensure we found the phi i32 node + assert(phiI32InClonedForBody && + "Unable to find phi i32 node in ClonedForBody"); +} + +static std::pair findTwoI32PhiInBB(BasicBlock *ForBody) { + // Find the first i32 type PHI instruction in ForBody + PHINode *firstI32PhiInBB = nullptr; + PHINode *secondI32PhiInBB = nullptr; + int i32PhiCount2 = 0; + for (auto &Inst : *ForBody) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (Phi->getType()->isIntegerTy(32)) { + if (i32PhiCount2 == 0) { + firstI32PhiInBB = Phi; + i32PhiCount2++; + } else if (i32PhiCount2 == 1) { + secondI32PhiInBB = Phi; + break; + } + } + } + } + + // Ensure we found two i32 type PHI instructions in ForBody + assert(firstI32PhiInBB && secondI32PhiInBB && + "Unable to find two i32 type PHI instructions in BB"); + + return std::make_pair(firstI32PhiInBB, secondI32PhiInBB); +} +static void modifyForBody2(BasicBlock *ClonedForCondPreheader, + BasicBlock *ForBody, BasicBlock *ForCondPreheader) { + // Find the first i32 type PHI instruction in ForCondPreheader + auto [firstI32PhiInForCondPreheader, secondI32PhiInForCondPreheader] = + findTwoI32PhiInBB(ForCondPreheader); + + // Find the first i32 type PHI instruction in ForBody + auto [firstI32PhiInForBody, secondI32PhiInForBody] = + findTwoI32PhiInBB(ForBody); + + // Set the incoming 0 value of the two i32 type PHI instructions found in + // ForBody to the firstI32Phi found in ForCondPreheader + firstI32PhiInForBody->setIncomingValue(0, firstI32PhiInForCondPreheader); + secondI32PhiInForBody->setIncomingValue(0, secondI32PhiInForCondPreheader); + + ForBody->getTerminator()->setSuccessor(0, ClonedForCondPreheader); + + // Find the first float type PHI instruction in ForCondPreheader + PHINode 
*SecondFloatPhiInForCondPreheader = nullptr; + int floatPhiCount = 0; + for (auto &Inst : *ForCondPreheader) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (Phi->getType()->isFloatTy()) { + floatPhiCount++; + if (floatPhiCount == 2) { + SecondFloatPhiInForCondPreheader = Phi; + break; + } + } + } + } + + // Ensure we found a float type PHI instruction in ForCondPreheader + assert(SecondFloatPhiInForCondPreheader && + "Unable to find float type PHI instruction in ForCondPreheader"); + + // Find the only float type PHI instruction in ForBody + PHINode *FloatPhiInForBody = getFirstFloatPhi(ForBody); + assert(FloatPhiInForBody && "Unable to find float type PHI instruction in " + "ForBody"); + + // Set incoming value 0 of the float type PHI instruction in ForBody + FloatPhiInForBody->setIncomingValue(0, SecondFloatPhiInForCondPreheader); + + // Find the unique float type PHI instruction in ClonedForCondPreheader + PHINode *FloatPhiInClonedForCondPreheader = + getFirstFloatPhi(ClonedForCondPreheader); + assert(FloatPhiInClonedForCondPreheader && + "Float type PHI instruction not found in ClonedForCondPreheader"); + + // Set incoming value 0 of the float type PHI instruction in + // ClonedForCondPreheader + FloatPhiInClonedForCondPreheader->setIncomingValue( + 0, SecondFloatPhiInForCondPreheader); +} + +// Helper function to run dead code elimination +static void runDeadCodeElimination(Function &F) { + legacy::FunctionPassManager FPM(F.getParent()); + FPM.add(createDeadCodeEliminationPass()); + FPM.run(F); + LLVM_DEBUG(F.dump()); +} + +static bool modifySecondForBody(Loop *L, Function &F, BasicBlock *ForBody, + BasicBlock *FirstClonedForCondPreheader, + BasicBlock *FirstForCondPreheader, + Value *AddInst) { + BasicBlock *ForBodyPreheader = L->getLoopPreheader(); + + // Find the 0th successor of ForBody, it should have two + BasicBlock *ForCondPreheader = getFirstSuccessorOfForBody(ForBody); + + std::tuple ClonedBBs = + cloneThreeBB(ForBodyPreheader, ForBody, ForCondPreheader, F); + BasicBlock *ClonedForBodyPreheader = std::get<0>(ClonedBBs); + BasicBlock *ClonedForBody = std::get<1>(ClonedBBs); + BasicBlock *ClonedForCondPreheader = std::get<2>(ClonedBBs); + + ClonedForCondPreheader->setName("for.end"); + ClonedForBody->moveBefore(ForBody); + ClonedForBodyPreheader->moveBefore(ClonedForBody); + ForCondPreheader->moveBefore(ClonedForBodyPreheader); + ClonedForCondPreheader->moveAfter(ForBody); + ForCondPreheader->getTerminator()->setSuccessor(0, ForBodyPreheader); + + unrollClonedForBody(ClonedForBody, ClonedForCondPreheader, 1); + modifyClonedForBody2(ClonedForBody, FirstClonedForCondPreheader, AddInst, + ForCondPreheader); + + Value *andinst = + modifyClonedForBodyPreheader(ClonedForBodyPreheader, ForBody); + modifyForCondPreheader2(ClonedForBody, ClonedForCondPreheader, + ForCondPreheader, andinst); + modifyClonedForCondPreheader(ClonedForCondPreheader, ForBody, + ForCondPreheader); + modifyForBody2(ClonedForCondPreheader, ForBody, ForCondPreheader); + + FirstForCondPreheader->getTerminator()->setSuccessor(0, + ClonedForBodyPreheader); + + // Run Dead Code Elimination optimization + runDeadCodeElimination(F); + + groupAndReorderInstructions(ClonedForBody); + + return true; +} +static void insertDoublePreheader(Function &F) { + BasicBlock *entry = &F.getEntryBlock(); + BasicBlock *ifend = &F.back(); + BasicBlock *entry_successor1 = entry->getTerminator()->getSuccessor(1); + + // Create a new basic block + BasicBlock *newBB = BasicBlock::Create( + F.getContext(), entry_successor1->getName() + 
".preheader", &F, + entry_successor1); + + Value *len = getLenFromEntryBlock(F); + + // Insert instructions in the new basic block + IRBuilder<> builder(newBB); + Value *cmp151349 = builder.CreateICmpSGT( + len, ConstantInt::get(len->getType(), 0), "cmp151349"); + + // Create a conditional branch + builder.CreateCondBr(cmp151349, entry_successor1, ifend); + + // Modify the terminator of entry to jump to the new basic block + entry->getTerminator()->setSuccessor(1, newBB); +} +static bool unrollFir(Function &F, Loop *L) { + + bool Changed = false; + static BasicBlock *FirstClonedForCondPreheader = nullptr; + static BasicBlock *FirstForCondPreheader = nullptr; + static Value *AddInst = nullptr; + + for (auto *BB : L->blocks()) { + + assert(BB->getName().contains("for.body") && "BB must is for.body"); + Changed = moveIfEndToEnd(F); + // Temporarily skip processing the second loop + + if (Changed) { + insertDoublePreheader(F); + Value *sub = modifyForCondPreheader(F); + std::tuple result = + modifyFirstForBody(L, F, BB, sub); + FirstClonedForCondPreheader = std::get<0>(result); + FirstForCondPreheader = std::get<1>(result); + AddInst = std::get<2>(result); + } else { + modifySecondForBody(L, F, BB, FirstClonedForCondPreheader, + FirstForCondPreheader, AddInst); + } + } + LLVM_DEBUG(F.dump()); + + return Changed; +} + +// Preprocessing function +static PHINode *preprocessClonedForBody(BasicBlock *ClonedForBody) { + // Find the unique PHI node + PHINode *phiNode = nullptr; + for (auto &I : *ClonedForBody) { + if (auto *phi = dyn_cast(&I)) { + phiNode = phi; + break; + } + } + + // Ensure that the PHI node is found + assert(phiNode && "PHI node not found"); + + // Find two mul nsw instructions + SmallVector mulInsts; + for (auto &I : *ClonedForBody) { + if (auto *binOp = dyn_cast(&I)) { + if (binOp->getOpcode() == Instruction::Mul && binOp->hasNoSignedWrap()) { + mulInsts.push_back(binOp); + } + } + } + + // Replace mul nsw instructions with the PHI node + for (auto *mulInst : mulInsts) { + mulInst->replaceAllUsesWith(phiNode); + mulInst->eraseFromParent(); + } + return phiNode; +} + +static Instruction *modifyAddToOrInClonedForBody(BasicBlock *ClonedForBody) { + // Find the unique add nuw nsw instruction + Instruction *addInst = nullptr; + for (auto &I : *ClonedForBody) { + if (auto *binOp = dyn_cast(&I)) { + if (binOp->getOpcode() == Instruction::Add && + binOp->hasNoUnsignedWrap()) { + addInst = binOp; + break; + } + } + } + + // Ensure that the add nuw nsw instruction is found + assert(addInst && "add nuw nsw instruction not found"); + + // Create a new or disjoint instruction + Instruction *orInst = BinaryOperator::CreateDisjoint( + Instruction::Or, addInst->getOperand(0), + ConstantInt::get(addInst->getType(), 1), "add", addInst); + + // Replace all uses of the add instruction + addInst->replaceAllUsesWith(orInst); + + // Delete the original add instruction + addInst->eraseFromParent(); + orInst->setName("add"); + return orInst; +} + +static void modifyAddToOr(BasicBlock *ClonedForBody) { + SmallVector addInsts; + + // Collect all add instructions that meet the criteria + for (auto &I : *ClonedForBody) { + if (auto *binOp = dyn_cast(&I)) { + if (binOp->getOpcode() == Instruction::Add) { + addInsts.push_back(binOp); + } + } + } + if (addInsts.empty()) { + return; + } + // Replace each add instruction with an or disjoint instruction + for (auto it = addInsts.begin(); it != std::prev(addInsts.end()); ++it) { + auto *addInst = *it; + // Create a new or disjoint instruction + Instruction *orInst 
= + BinaryOperator::CreateDisjoint(Instruction::Or, addInst->getOperand(0), + addInst->getOperand(1), "add", addInst); + + // Replace all uses of the add instruction + addInst->replaceAllUsesWith(orInst); + + // Delete the original add instruction + addInst->eraseFromParent(); + orInst->setName("add"); + } +} + +static Value *unrolladdcClonedForBody(BasicBlock *ClonedForBody, + int unroll_factor) { + + // Call the preprocessing function + PHINode *phiNode = preprocessClonedForBody(ClonedForBody); + + // Replace add instructions with or instructions + Instruction *orInst = modifyAddToOrInClonedForBody(ClonedForBody); + + // Find the first non-PHI instruction and or instruction + Instruction *firstNonPHI = ClonedForBody->getFirstNonPHI(); + + // Ensure that the start and end instructions are found + assert(firstNonPHI && orInst && "Start or end instruction not found"); + + // Find the icmp instruction + Instruction *icmpInst = getFirstICmpInst(ClonedForBody); + + // Ensure that the icmp instruction is found + assert(icmpInst && "icmp instruction not found"); + + // Print information about the icmp instruction + + Instruction *newOrInst = orInst; + // Copy instructions 15 times + for (int i = 1; i <= (unroll_factor - 1); i++) { + ValueToValueMapTy VMap; + for (auto it = firstNonPHI->getIterator(); &*it != orInst; ++it) { + Instruction *newInst = it->clone(); + // For getelementptr instructions, set the second operand to orInst + if (GetElementPtrInst *GEP = dyn_cast(newInst)) { + newInst->setOperand(1, newOrInst); + newInst->setName("arrayidx"); + } + // If it's a fadd instruction, change its name to add + if (newInst->getOpcode() == Instruction::FAdd) { + newInst->setName("add"); + } + VMap[&*it] = newInst; + newInst->insertBefore(icmpInst); + } + + // Update operands of new instructions + for (auto it = firstNonPHI->getIterator(); &*it != orInst; ++it) { + Instruction *newInst = cast(VMap[&*it]); + for (unsigned j = 0; j < newInst->getNumOperands(); j++) { + Value *op = newInst->getOperand(j); + if (VMap.count(op)) { + newInst->setOperand(j, VMap[op]); + } + } + } + // Clone orInst and insert before icmpInst + newOrInst = orInst->clone(); + // Set the second operand of newOrInst to i+1 + newOrInst->setOperand(1, ConstantInt::get(newOrInst->getType(), i + 1)); + newOrInst->setName("add"); + newOrInst->insertBefore(icmpInst); + VMap[orInst] = newOrInst; + } + + // Replace or instruction with add nuw nsw instruction + IRBuilder<> Builder(newOrInst); + Value *newAddInst = + Builder.CreateNUWAdd(newOrInst->getOperand(0), newOrInst->getOperand(1)); + newOrInst->replaceAllUsesWith(newAddInst); + newOrInst->eraseFromParent(); + + // Create a new add instruction, subtracting 16 from len + Builder.SetInsertPoint(icmpInst); + Value *len = icmpInst->getOperand(1); + Value *sub = Builder.CreateNSWAdd( + len, ConstantInt::get(len->getType(), -unroll_factor), "sub"); + // Set the icmp instruction's predicate to sgt, and operands to newAddInst + if (ICmpInst *icmp = dyn_cast(icmpInst)) { + icmp->setPredicate(ICmpInst::ICMP_SGT); + icmp->setOperand(0, newAddInst); + icmp->setOperand(1, sub); + } + + phiNode->setIncomingValue(0, newAddInst); + return sub; +} + +static void expandForCondPreheaderaddc(Function &F, + BasicBlock *ForCondPreheader, + BasicBlock *ClonedForBody, + BasicBlock *ForBody, Value *sub, + int unroll_factor) { + // Create a new ForCondPreheader after the original ForCondPreheader + BasicBlock *NewForCondPreheader = BasicBlock::Create( + ForCondPreheader->getContext(), 
"for.cond.preheader.new", + ForCondPreheader->getParent(), ForCondPreheader->getNextNode()); + // Create a new empty BasicBlock after NewForCondPreheader + BasicBlock *NewForCondPreheader2 = BasicBlock::Create( + NewForCondPreheader->getContext(), "for.cond.preheader.new2", + NewForCondPreheader->getParent(), NewForCondPreheader->getNextNode()); + + // Move sub to the new ForCondPreheader + if (Instruction *SubInst = dyn_cast(sub)) { + SubInst->removeFromParent(); + SubInst->insertInto(NewForCondPreheader, NewForCondPreheader->begin()); + } + + // Create new comparison instruction in NewForCondPreheader + IRBuilder<> Builder(NewForCondPreheader); + Value *len = getLenFromEntryBlock(F); + + assert(len && "Parameter named 'len' not found"); + + Value *cmp6not207 = Builder.CreateICmpULT( + len, ConstantInt::get(len->getType(), unroll_factor), "cmp6.not207"); + + // Create conditional branch instruction + Builder.CreateCondBr(cmp6not207, NewForCondPreheader2, ClonedForBody); + + // Find if.end basic block + BasicBlock *ifEndBB = getBasicBlockByName(F, "if.end"); + BasicBlock *returnBB = getBasicBlockByName(F, "return"); + assert(ifEndBB && "Expected to find if.end!"); + assert(returnBB && "Expected to find return!"); + // Get the terminator instruction of if.end + Instruction *terminator = ifEndBB->getTerminator(); + if (!terminator) { + assert(false && "if.end basic block has no terminator instruction\n"); + return; + } + + // Replace the first operand of the terminator instruction with + // NewForCondPreheader + terminator->setOperand(2, NewForCondPreheader); + + // Find the unique PHINode in clonedForBody + PHINode *uniquePHI = nullptr; + for (Instruction &I : *ClonedForBody) { + if (auto *phi = dyn_cast(&I)) { + if (uniquePHI) { + // If we've already found a PHINode but find another, it's not unique + + uniquePHI = nullptr; + break; + } + uniquePHI = phi; + } + } + + assert(uniquePHI && "No unique PHINode found in ForBody\n"); + + uniquePHI->setIncomingBlock(1, NewForCondPreheader); + auto *clonedphi = uniquePHI->clone(); + clonedphi->insertInto(NewForCondPreheader2, NewForCondPreheader2->begin()); + + // Create comparison instruction + ICmpInst *cmp85209 = + new ICmpInst(ICmpInst::ICMP_SLT, clonedphi, len, "cmp85209"); + cmp85209->insertAfter(clonedphi); + + // Create conditional branch instruction + BranchInst *br = BranchInst::Create(ForBody, returnBB, cmp85209); + + br->insertAfter(cmp85209); + + // Get the terminator instruction of ClonedForBody + BranchInst *clonedTerminator = + dyn_cast(ClonedForBody->getTerminator()); + assert(clonedTerminator && + "ClonedForBody's terminator should be a BranchInst"); + if (!clonedTerminator) { + assert(false && "ClonedForBody has no terminator instruction\n"); + return; + } + + // Set the first operand of ClonedForBody's terminator to NewForCondPreheader2 + clonedTerminator->setOperand(2, NewForCondPreheader2); + + // Find the unique PHI node in ForBody + PHINode *uniquePHI2 = nullptr; + for (Instruction &I : *ForBody) { + if (auto *phi = dyn_cast(&I)) { + if (uniquePHI2) { + // If we've already found a PHINode but find another, it's not unique + + uniquePHI = nullptr; + break; + } + uniquePHI2 = phi; + } + } + + assert(uniquePHI2 && "No unique PHINode found in ForBody\n"); + + uniquePHI2->setIncomingValue(1, clonedphi); + uniquePHI2->setIncomingBlock(1, NewForCondPreheader2); + + // Find the unique PHI node in returnBB + PHINode *returnBBPHI = nullptr; + for (Instruction &I : *returnBB) { + if (auto *phi = dyn_cast(&I)) { + if (returnBBPHI) { + 
// If we've already found a PHINode but find another, it's not unique + returnBBPHI = nullptr; + break; + } + returnBBPHI = phi; + } + } + + if (returnBBPHI) { + // Add [0, NewForCondPreheader2] + returnBBPHI->addIncoming(ConstantInt::get(returnBBPHI->getType(), 0), + NewForCondPreheader2); + } else { + assert(false && "No unique PHI node found in returnBB\n"); + } +} + +static void addnoalias(Function &F) { + for (Argument &Arg : F.args()) { + if (Arg.getType()->isPointerTy()) { + Arg.addAttr(Attribute::NoAlias); + } + } +} +static BasicBlock *cloneForBody(Function &F, BasicBlock *ForBody, + const std::string &Suffix) { + ValueToValueMapTy VMap; + BasicBlock *ClonedForBody = CloneBasicBlock(ForBody, VMap, Suffix, &F); + VMap[ForBody] = ClonedForBody; + remapInstructionsInBlocks({ClonedForBody}, VMap); + return ClonedForBody; +} + +static void unrollAddc(Function &F, ScalarEvolution &SE, Loop *L, + int unroll_factor) { + + // Get the basic block containing the function body from L + BasicBlock *ForBody = L->getHeader(); + + // Ensure that the basic block containing the function body is found + if (!ForBody) { + assert(ForBody && "ForBody not found"); + return; + } + + // clone for body + + BasicBlock *ClonedForBody = cloneForBody(F, ForBody, ".modify"); + ClonedForBody->moveBefore(ForBody); + + Value *sub = unrolladdcClonedForBody(ClonedForBody, unroll_factor); + + // Find the ForCondPreheader basic block from F + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + assert(ForCondPreheader && "Expected to find for.cond.preheader!"); + expandForCondPreheaderaddc(F, ForCondPreheader, ClonedForBody, ForBody, sub, + unroll_factor); + modifyAddToOr(ClonedForBody); + groupAndReorderInstructions(ClonedForBody); + + // Verify the function + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(errs() << "Function verification failed\n"); + return; + } +} + +static void unrollCorr(Function &F, Loop *L, int unroll_factor) { + + // Get the basic block containing the function body from L + BasicBlock *ForBody = L->getHeader(); + assert(ForBody && "ForBody not found"); + + // clone for body + BasicBlock *ClonedForBody = cloneForBody(F, ForBody, ".unroll"); + + BasicBlock *returnBB = getBasicBlockByName(F, "return"); + assert(returnBB && "Expected to find return!"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + assert(ForCondPreheader && "Expected to find for.cond.preheader!"); + BasicBlock *ForCond11PreheaderUs = L->getLoopPreheader(); + assert(ForCond11PreheaderUs && "Expected to find for.cond.preheader!"); + + ClonedForBody->moveBefore(returnBB); + + ForCondPreheader->setName("if.end"); + + // Find the first instruction in ForCondPreheader + Instruction *FirstInst = &*ForCondPreheader->begin(); + Instruction *SecondInst = FirstInst->getNextNode(); + // Ensure the first instruction is a sub nsw instruction + if (BinaryOperator *SubInst = dyn_cast(FirstInst)) { + if (SubInst->getOpcode() == Instruction::Sub && + SubInst->hasNoSignedWrap()) { + ; + } else { + assert(false && "The first instruction in ForCondPreheader is not a sub " + "nsw instruction\n"); + } + } else { + assert(false && "The first instruction in ForCondPreheader is not a binary " + "operation\n"); + } + // Insert new instruction after FirstInst + IRBuilder<> Builder(FirstInst->getNextNode()); + Value *Sub6 = Builder.CreateNSWAdd( + FirstInst, ConstantInt::get(FirstInst->getType(), 1 - unroll_factor), + "sub6"); + + if (ICmpInst *CmpInst = dyn_cast(SecondInst)) { + if 
(CmpInst->getPredicate() == ICmpInst::ICMP_EQ) { + CmpInst->setOperand(0, FirstInst); + CmpInst->setOperand( + 1, ConstantInt::get(FirstInst->getType(), unroll_factor - 1)); + CmpInst->setPredicate(ICmpInst::ICMP_SGT); + } + } + // Create new basic blocks + BasicBlock *ForCond11PreheaderPreheader = ForCondPreheader->getNextNode(); + BasicBlock *ForCond8PreheaderLrPh = + BasicBlock::Create(F.getContext(), "for.cond8.preheader.lr.ph", &F, + ForCond11PreheaderPreheader); + BasicBlock *ForCond8Preheader = BasicBlock::Create( + F.getContext(), "for.cond8.preheader", &F, ForCond11PreheaderPreheader); + BasicBlock *ForBody10LrPh = BasicBlock::Create( + F.getContext(), "for.body10.lr.ph", &F, ForCond11PreheaderPreheader); + BasicBlock *ForCond91Preheader = BasicBlock::Create( + F.getContext(), "for.cond91.preheader", &F, ForCond11PreheaderPreheader); + BasicBlock *ForCond95PreheaderLrPh = + BasicBlock::Create(F.getContext(), "for.cond95.preheader.lr.ph", &F, + ForCond11PreheaderPreheader); + + // Set predecessors for the basic blocks + ForCondPreheader->getTerminator()->setSuccessor(0, ForCond8PreheaderLrPh); + ForCondPreheader->getTerminator()->setSuccessor(1, ForCond91Preheader); + + // Find the parameter named patlen from the function arguments + Value *PatlenArg = F.getArg(3); + Value *SignalArg = F.getArg(0); + assert(PatlenArg && "Parameter named patlen not found\n"); + assert(SignalArg && "Parameter named signal not found\n"); + + // Add instructions to the for.cond8.preheader.lr.ph basic block + Builder.SetInsertPoint(ForCond8PreheaderLrPh); + Value *Cmp9242 = Builder.CreateICmpSGT( + PatlenArg, ConstantInt::get(PatlenArg->getType(), 0), "cmp9242"); + Builder.CreateBr(ForCond8Preheader); + + // Add instructions to the for.cond8.preheader basic block + Builder.SetInsertPoint(ForCond8Preheader); + PHINode *N0276 = + Builder.CreatePHI(Type::getInt32Ty(F.getContext()), 2, "n.0276"); + N0276->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + ForCond8PreheaderLrPh); + + // Create conditional branch instruction + Builder.CreateCondBr(Cmp9242, ForBody10LrPh, nullptr); + + // Add instructions to the for.body10.lr.ph basic block + Builder.SetInsertPoint(ForBody10LrPh); + + // Create getelementptr instruction + Value *GEP = + Builder.CreateGEP(Type::getFloatTy(F.getContext()), SignalArg, N0276, ""); + + // Create unconditional branch instruction to ClonedForBody + Builder.CreateBr(ClonedForBody); + + // Add instructions to the for.cond91.preheader basic block + Builder.SetInsertPoint(ForCond91Preheader); + + // Create PHI node + PHINode *N0Lcssa = + Builder.CreatePHI(Type::getInt32Ty(F.getContext()), 2, "n.0.lcssa"); + N0Lcssa->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + ForCondPreheader); + // Note: [ %add89, %for.cond.cleanup ] part not added yet + + // Create comparison instruction + Value *Cmp92Not282 = + Builder.CreateICmpSGT(N0Lcssa, FirstInst, "cmp92.not282"); + + // Create conditional branch instruction + Builder.CreateCondBr(Cmp92Not282, returnBB, ForCond95PreheaderLrPh); + + // Add instructions to the for.cond95.preheader.lr.ph basic block + Builder.SetInsertPoint(ForCond95PreheaderLrPh); + + Value *Cmp92678 = Builder.CreateICmpSGT( + PatlenArg, ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + "Cmp92678"); + // Insert Cmp92678 + Builder.CreateCondBr(Cmp92678, ForCond11PreheaderUs, + ForCond11PreheaderPreheader); + + Builder.SetInsertPoint(ForCond11PreheaderPreheader, + ForCond11PreheaderPreheader->begin()); + + Instruction 
*ForCond11PreheaderPreheaderterminater =
+      ForCond11PreheaderPreheader->getTerminator();
+  Instruction *ForCond11PreheaderPreheaderFirstInst =
+      &*ForCond11PreheaderPreheader->begin();
+  Value *SiglenArg = ForCond11PreheaderPreheaderFirstInst->getOperand(0);
+  // Calculate the result of n.0.lcssa left shifted by 2 bits
+  Value *ShiftedN = Builder.CreateShl(
+      N0Lcssa, ConstantInt::get(Type::getInt32Ty(F.getContext()), 2), "");
+
+  // Create getelementptr instruction
+  // Find memset function call
+  CallInst *MemsetCall = getFirstCallInstWithName(ForCond11PreheaderPreheader,
+                                                  "llvm.memset.p0.i32");
+
+  // Ensure memset call is found
+  assert(MemsetCall && "memset call not found");
+
+  // Get DestArg
+  Value *DestArg = MemsetCall->getArgOperand(0);
+
+  // Create new GEP instruction
+  Value *Scevgep = Builder.CreateGEP(Type::getInt8Ty(F.getContext()), DestArg,
+                                     ShiftedN, "scevgep");
+  MemsetCall->setOperand(0, Scevgep);
+  // Calculate siglen + 1
+  Value *SiglenPlus1 = Builder.CreateAdd(
+      SiglenArg, ConstantInt::get(Type::getInt32Ty(F.getContext()), 1), "");
+
+  // Calculate n.0.lcssa + patlen
+  Value *NplusPatlen = Builder.CreateAdd(N0Lcssa, PatlenArg, "");
+
+  // Calculate (siglen + 1) - (n.0.lcssa + patlen)
+  Value *SubResult = Builder.CreateSub(SiglenPlus1, NplusPatlen, "");
+
+  // Calculate the final memset length
+  Value *MemsetLen = Builder.CreateShl(
+      SubResult, ConstantInt::get(Type::getInt32Ty(F.getContext()), 2), "");
+  Instruction *addinst = dyn_cast<Instruction>(MemsetCall->getOperand(2));
+  MemsetCall->setOperand(2, MemsetLen);
+  if (addinst && addinst->use_empty())
+    addinst->eraseFromParent();
+  if (ForCond11PreheaderPreheaderFirstInst->use_empty())
+    ForCond11PreheaderPreheaderFirstInst->eraseFromParent();
+
+  // Create a Preheader for ForCond11PreheaderUs
+  BasicBlock *ForCond11PreheaderUsPreheader =
+      BasicBlock::Create(F.getContext(), "for.cond11.preheader.us.preheader",
+                         &F, ForCond11PreheaderUs);
+
+  // Add an unconditional branch to ForCond11PreheaderUs in the new Preheader
+  BranchInst::Create(ForCond11PreheaderUs, ForCond11PreheaderUsPreheader);
+
+  // Insert new instructions in ForCond11PreheaderUsPreheader
+  Builder.SetInsertPoint(ForCond11PreheaderUsPreheader->getTerminator());
+
+  // Add %6 = add i32 %siglen, 1
+  Value *SiglenPlus2 = Builder.CreateAdd(
+      SiglenArg, ConstantInt::get(Type::getInt32Ty(F.getContext()), 1), "");
+
+  // Add %7 = sub i32 %6, %patlen
+  Value *SubResult2 = Builder.CreateSub(SiglenPlus2, PatlenArg, "");
+
+  // Find PHI node
+  PHINode *PhiNode = nullptr;
+  for (PHINode &Phi : ForCond11PreheaderUs->phis()) {
+    PhiNode = &Phi;
+    break;
+  }
+
+  assert(PhiNode && "PHI node not found in for.cond11.preheader.us\n");
+
+  // Modify incoming values of the PHI node
+  PhiNode->setIncomingBlock(1, ForCond11PreheaderUsPreheader);
+  PhiNode->setIncomingValue(1, N0Lcssa);
+
+  BasicBlock *ForCond11ForCondCleanup13CritEdgeUs = ForBody->getNextNode();
+  // Find icmp ult instruction in ForCond11ForCondCleanup13CritEdgeUs
+  ICmpInst *IcmpUltInst = getLastICmpInstWithPredicate(
+      ForCond11ForCondCleanup13CritEdgeUs, ICmpInst::ICMP_ULT);
+
+  assert(IcmpUltInst && "icmp ult instruction not found in "
+                        "ForCond11ForCondCleanup13CritEdgeUs\n");
+
+  IcmpUltInst->setOperand(0, PhiNode->getIncomingValue(0));
+  IcmpUltInst->setOperand(1, SubResult2);
+  IcmpUltInst->setPredicate(ICmpInst::ICMP_EQ);
+
+  swapTerminatorSuccessors(ForCond11ForCondCleanup13CritEdgeUs);
+
+  // Find PHI nodes in ClonedForBody
+  for (PHINode &Phi : ClonedForBody->phis()) {
+    Phi.setIncomingBlock(0,
ForBody10LrPh); + } + + // Find phi float instruction in ClonedForBody + PHINode *FloatPhi = getFirstFloatPhi(ClonedForBody); + assert(FloatPhi && "phi float node not found"); + // Find getelementptr inbounds instructions in ClonedForBody + GetElementPtrInst *GEPInst = nullptr; + GetElementPtrInst *GEPInst2 = nullptr; + for (auto &I : *ClonedForBody) { + if (auto *GEP = dyn_cast(&I)) { + if (GEP->isInBounds()) { + GEPInst = GEP; + } else { + GEPInst2 = GEP; + } + } + } + assert(GEPInst && + "getelementptr inbounds instruction not found in ClonedForBody\n"); + assert(GEPInst2 && + "getelementptr inbounds instruction not found in ClonedForBody\n"); + + GEPInst2->setOperand(0, GEP); + + Instruction *loadinst = GEPInst->getNextNode(); + GEPInst->moveBefore(FloatPhi); + loadinst->moveBefore(FloatPhi); + + if (FloatPhi) { + // Find the llvm.fmuladd.f32 instruction + Instruction *FMulAdd = + getFirstCallInstWithName(ClonedForBody, "llvm.fmuladd.f32"); + assert(FMulAdd && "llvm.fmuladd.f32 instruction not found\n"); + Instruction *InsertPoint = FMulAdd->getNextNode(); + if (FMulAdd) { + // Copy instructions unroll_factor-1 times + for (int i = 0; i < (unroll_factor - 1); ++i) { + ValueToValueMapTy VMap; + for (auto It = FloatPhi->getIterator(); &*It != FMulAdd->getNextNode(); + ++It) { + Instruction *NewInst = It->clone(); + VMap[&*It] = NewInst; + NewInst->insertBefore(InsertPoint); + } + + // Update operands of new instructions + for (auto It = FloatPhi->getIterator(); &*It != FMulAdd->getNextNode(); + ++It) { + Instruction *NewInst = cast(VMap[&*It]); + for (unsigned j = 0; j < NewInst->getNumOperands(); j++) { + Value *Op = NewInst->getOperand(j); + if (VMap.count(Op)) { + NewInst->setOperand(j, VMap[Op]); + } + } + // If NewInst is a getelementptr instruction, set its operand 1 to i+1 + if (GetElementPtrInst *GEP = dyn_cast(NewInst)) { + GEP->setOperand(0, GEPInst); + GEP->setOperand( + 1, ConstantInt::get(GEP->getOperand(1)->getType(), i + 1)); + GEP->setName("arrayidx" + std::to_string(i + 1)); + } + } + } + + } else { + assert(false && "llvm.fmuladd.f32 instruction not found\n"); + } + } else { + assert(false && "phi float instruction not found\n"); + } + movePHINodesToTop(*ClonedForBody); + groupAndReorderInstructions(ClonedForBody); + + // Create new basic block for.cond.cleanup + BasicBlock *ForCondCleanup = + BasicBlock::Create(F.getContext(), "for.cond.cleanup", &F, ClonedForBody); + + ForCond8Preheader->getTerminator()->setSuccessor(1, ForCondCleanup); + // Create unconditional branch to ClonedForBody in for.cond.cleanup + BranchInst::Create(ClonedForBody, ForCondCleanup); + + // Get the terminator instruction of ClonedForBody + Instruction *Terminator = ClonedForBody->getTerminator(); + + // Set the first successor of ClonedForBody to for.cond.cleanup + if (Terminator->getNumSuccessors() > 0) { + Terminator->setSuccessor(0, ForCondCleanup); + } + + // Clone phi float nodes from ClonedForBody to ForCondCleanup + int i = 0; + for (PHINode &Phi : ClonedForBody->phis()) { + if (Phi.getType()->isFloatTy()) { + Instruction *newPhi = Phi.clone(); + cast(newPhi)->setIncomingBlock(0, ForCond8Preheader); + newPhi->insertBefore(ForCondCleanup->getTerminator()); + if (i == 0) { + GetElementPtrInst *arrayidx = GetElementPtrInst::Create( + Type::getFloatTy(F.getContext()), DestArg, N0276, "arrayidx", + ForCondCleanup->getTerminator()); + StoreInst *storeInst = + new StoreInst(newPhi, arrayidx, ForCondCleanup->getTerminator()); + } else { + Instruction *orInst = BinaryOperator::CreateDisjoint( + 
Instruction::Or, N0276, ConstantInt::get(N0276->getType(), i), + "add"); + orInst->insertBefore(ForCondCleanup->getTerminator()); + GetElementPtrInst *arrayidx = GetElementPtrInst::Create( + Type::getFloatTy(F.getContext()), DestArg, orInst, "arrayidx", + ForCondCleanup->getTerminator()); + + StoreInst *storeInst = + new StoreInst(newPhi, arrayidx, ForCondCleanup->getTerminator()); + } + i++; + } + } + + // Insert new instructions at the end of ClonedForBody + Builder.SetInsertPoint(ForCondCleanup->getTerminator()); + Value *add89 = Builder.CreateAdd( + N0276, ConstantInt::get(N0276->getType(), unroll_factor), "add89", true, + true); + Value *cmp7 = Builder.CreateICmpSLT(add89, Sub6, "cmp7"); + + // Get the original terminator instruction + Instruction *OldTerminator = ForCondCleanup->getTerminator(); + + // Create new conditional branch instruction + BranchInst *NewBr = + BranchInst::Create(ForCond8Preheader, ForCond91Preheader, cmp7); + + // Insert new branch instruction and delete the old terminator + ReplaceInstWithInst(OldTerminator, NewBr); + + movePHINodesToTop(*ForCondCleanup); + groupAndReorderInstructions(ForCondCleanup); + + // Update PHI nodes in for.cond8.preheader + for (PHINode &Phi : ForCond8Preheader->phis()) { + Phi.addIncoming(add89, ForCondCleanup); + } + + // Update PHI nodes in for.cond91.preheader + for (PHINode &Phi : ForCond91Preheader->phis()) { + Phi.addIncoming(add89, ForCondCleanup); + } + + // Iterate through all PHI nodes in returnBB + for (PHINode &Phi : returnBB->phis()) { + // Add new incoming value for each PHI node + Phi.addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + ForCond91Preheader); + } + // for.cond95.preheader.lr.ph -> for.cond11.preheader.us.preheader + ForCond95PreheaderLrPh->getTerminator()->setSuccessor( + 0, ForCond11PreheaderUsPreheader); +} + +static bool checkIfDotProdSimplest(Function &F) { + bool flag = false; + + if (F.size() == 3) { + BasicBlock *entryBB = getBasicBlockByName(F, "entry"); + BasicBlock *forCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + BasicBlock *forBody = getBasicBlockByName(F, "for.body"); + if (entryBB && forCondCleanup && forBody) { + CallInst *fmuladd = getFirstCallInstWithName(forBody, "llvm.fmuladd.f32"); + if (fmuladd) { + if (forBody->getTerminator()->getSuccessor(0) == forCondCleanup && + forBody->getTerminator()->getSuccessor(1) == forBody) { + if (entryBB->getTerminator()->getSuccessor(0) == forBody) { + flag = true; + } + } + } + } + } + return flag; +} +// for dotprod, llvm.fmuladd.f32 is in for.body +static bool checkIfDotProdComplicated(Function &F) { + bool flag1 = false; + bool flag2 = false; + bool flag3 = false; + if (F.size() == 3) { + BasicBlock *entryBB = getBasicBlockByName(F, "entry"); + BasicBlock *forCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + BasicBlock *forBody = getBasicBlockByName(F, "for.body"); + if (entryBB && forCondCleanup && forBody) { + CallInst *fmuladd = getFirstCallInstWithName(forBody, "llvm.fmuladd.f32"); + if (fmuladd) { + + if (forBody->getTerminator()->getSuccessor(0) == forCondCleanup && + forBody->getTerminator()->getSuccessor(1) == forBody) { + if (entryBB->getTerminator()->getSuccessor(0) == forBody) { + flag1 = true; + } + } + } + } + if (forBody) { + for (Instruction &I : *forBody) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::FAdd || + BinOp->getOpcode() == Instruction::FMul || + BinOp->getOpcode() == Instruction::FSub || + BinOp->getOpcode() == Instruction::FDiv) { + flag2 = true; + 
} + } + } + + // Check if forBody has exactly one float PHI node + int floatPhiCount = 0; + for (PHINode &Phi : forBody->phis()) { + if (Phi.getType()->isFloatTy()) { + floatPhiCount++; + } + } + if (floatPhiCount == 1) { + flag3 = true; + } + } + } + + return flag1 && flag2 && flag3; +} +static bool shouldUnrollLoopWithCount(Function &F, Loop *L, + ScalarEvolution &SE) { + if (!checkIfDotProdSimplest(F)) { + return false; + } + // Check if the loop is suitable for unrolling + if (!L->getLoopLatch()) + return false; + if (!L->getExitingBlock()) + return false; + + // Check if the loop count is fixed and appropriate, loop count is constant + const SCEV *TripCount = SE.getBackedgeTakenCount(L); + if (isa(TripCount)) { + // More condition checks can be added here + return true; + } + return false; +} + +static void +insertPhiNodesForFMulAdd(BasicBlock *LoopHeader, BasicBlock *LoopPreheader, + SmallVector &FMulAddCalls) { + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + for (Instruction &I : *LoopHeader) { + if (CallInst *CI = dyn_cast(&I)) { + if (Function *F = CI->getCalledFunction()) { + if (F->getName() == "llvm.fmuladd.f32" && CI->isTailCall()) { + FMulAddCalls.push_back(CI); + } + } + } + } + + // Insert phi nodes for each FMulAdd call + for (CallInst *CI : FMulAddCalls) { + // Create new phi node + PHINode *PHI = + PHINode::Create(CI->getType(), 2, CI->getName() + ".phi", CI); + + // Set incoming values for phi node + PHI->addIncoming(ConstantFP::get(CI->getType(), 0), LoopPreheader); + PHI->addIncoming(CI, LoopHeader); + + CI->setOperand(2, PHI); + } +} + +static void postUnrollLoopWithCount(Function &F, Loop *L, int unroll_count) { + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *LoopPreheader = L->getLoopPreheader(); + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + SmallVector FMulAddCalls; + insertPhiNodesForFMulAdd(LoopHeader, LoopPreheader, FMulAddCalls); + + movePHINodesToTop(*LoopHeader); + modifyAddToOr(LoopHeader); + groupAndReorderInstructions(LoopHeader); + + // Create for.end basic block after LoopHeader + ICmpInst *LastICmp = getLastICmpInst(LoopHeader); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + // Get the first operand of LastICmp + Value *Operand1 = LastICmp->getOperand(1); + + // Directly set the first operand of LastICmp to a new constant value + LastICmp->setOperand( + 1, ConstantInt::get(Operand1->getType(), + dyn_cast(Operand1)->getSExtValue() - + (2 * unroll_count - 1))); + LastICmp->setName("cmp"); + + swapTerminatorSuccessors(LoopHeader); + + // After swapping, succ 0 is LoopHeader, succ 1 is returnBB + BasicBlock *ExitingBlock = L->getExitBlock(); + ExitingBlock->setName("for.end"); + + // Get ret instruction in ExitingBlock + ReturnInst *RetInst = dyn_cast(ExitingBlock->getTerminator()); + if (!RetInst) { + assert(false && "ret instruction not found\n"); + return; + } + + // Get the original return value + Value *OriginalRetValue = RetInst->getOperand(0); + + // Create IRBuilder, set insertion point before ret instruction + IRBuilder<> Builder(RetInst); + + // Create a series of fadd instructions + Value *CurrentSum = OriginalRetValue; + Value *add37 = Builder.CreateFAdd(FMulAddCalls[1], CurrentSum, "add37"); + Value *add38 = Builder.CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], "add38"); + Value *add39 = Builder.CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], "add39"); + Value *add40 = Builder.CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], "add40"); + Value *add41 = Builder.CreateFAdd(add37, add38, "add41"); + Value 
*add42 = Builder.CreateFAdd(add39, add40, "add42"); + CurrentSum = Builder.CreateFAdd(add41, add42, "add43"); + + // Replace the original ret instruction + RetInst->setOperand(0, CurrentSum); + + // Verify function + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(errs() << "Function verification failed\n"); + return; + } +} + +static bool shouldUnrollComplexLoop(Function &F, Loop *L, ScalarEvolution &SE, + DominatorTree &DT, LoopInfo &LI) { + if (!checkIfDotProdComplicated(F)) { + return false; + } + // Check if the loop is suitable for unrolling + if (!L->getLoopLatch()) + return false; + if (!L->getExitingBlock()) + return false; + + if (L->getCanonicalInductionVariable()) + return false; + // Check if the loop count is fixed and appropriate, loop count is constant + BasicBlock *LoopPreheader = L->getLoopPreheader(); + // Get the start value of the loop + if (LoopPreheader) { + return false; + } + + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *NewPreheader = + BasicBlock::Create(LoopHeader->getContext(), "for.cond.preheader", + LoopHeader->getParent(), LoopHeader); + // Redirect all external predecessors to the new preheader basic block + for (BasicBlock *pred : predecessors(LoopHeader)) { + if (!L->contains(pred)) { + pred->getTerminator()->replaceUsesOfWith(LoopHeader, NewPreheader); + // Update PHI nodes in the loop header to point to the new preheader basic + // block + for (PHINode &PN : LoopHeader->phis()) { + int Index = PN.getBasicBlockIndex(pred); + if (Index != -1) { + PN.setIncomingBlock(Index, NewPreheader); + } + } + } + } + // Jump from the new preheader to the loop header + BranchInst::Create(LoopHeader, NewPreheader); + return true; +} + +static bool shouldUnrollAddcType(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 6) + return false; + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 1) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *IfEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForBodyClone = getBasicBlockByName(F, "for.body.clone"); + BasicBlock *Return = getBasicBlockByName(F, "return"); + + if (!Entry || !IfEnd || !ForCondPreheader || !ForBody || !ForBodyClone || + !Return) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != Return || + Entry->getTerminator()->getSuccessor(1) != IfEnd || + IfEnd->getTerminator()->getSuccessor(0) != ForBody || + IfEnd->getTerminator()->getSuccessor(1) != ForCondPreheader || + ForCondPreheader->getTerminator()->getSuccessor(0) != ForBodyClone || + ForCondPreheader->getTerminator()->getSuccessor(1) != Return || + ForBody->getTerminator()->getSuccessor(0) != Return || + ForBody->getTerminator()->getSuccessor(1) != ForBody || + ForBodyClone->getTerminator()->getSuccessor(0) != Return || + ForBodyClone->getTerminator()->getSuccessor(1) != ForBodyClone) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + if (L->getSubLoops().size() == 1) { + innerLoopCount++; + } + } + } + + if (outerLoopCount != 2 || innerLoopCount != 0) { + return false; + } + + return true; +} + +static bool 
shouldUnrollDotprodType(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 5) + return false; + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 1) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *IfEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForBodyClone = getBasicBlockByName(F, "for.body.clone"); + + if (!Entry || !IfEnd || !ForCondPreheader || !ForBody || !ForBodyClone) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != ForBody || + Entry->getTerminator()->getSuccessor(1) != ForCondPreheader || + ForCondPreheader->getTerminator()->getSuccessor(0) != ForBodyClone || + ForCondPreheader->getTerminator()->getSuccessor(1) != IfEnd || + ForBody->getTerminator()->getSuccessor(0) != IfEnd || + ForBody->getTerminator()->getSuccessor(1) != ForBody || + ForBodyClone->getTerminator()->getSuccessor(0) != IfEnd || + ForBodyClone->getTerminator()->getSuccessor(1) != ForBodyClone) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + if (L->getSubLoops().size() == 1) { + innerLoopCount++; + } + } + } + + if (outerLoopCount != 2 || innerLoopCount != 0) { + return false; + } + + return true; +} + +static std::pair modifyEntryBB(BasicBlock &entryBB) { + ICmpInst *icmp = getLastICmpInst(&entryBB); + assert(icmp && "icmp not found"); + Value *start_index = icmp->getOperand(0); + Value *end_index = icmp->getOperand(1); + // Insert new instructions before icmp + IRBuilder<> Builder(icmp); + Value *sub = Builder.CreateNSWAdd( + end_index, ConstantInt::get(end_index->getType(), -8), "sub"); + icmp->setOperand(0, sub); + icmp->setOperand(1, start_index); + return std::make_pair(sub, end_index); +} + +static void postUnrollLoopWithVariable(Function &F, Loop *L, int unroll_count) { + BasicBlock *LoopPreheader = L->getLoopPreheader(); + // Get the basic blocks to merge + SmallVector BBsToMerge; + BasicBlock *ForBody1 = getBasicBlockByName(F, "for.body.1"); + BasicBlock *ForBody2 = getBasicBlockByName(F, "for.body.2"); + BasicBlock *ForBody3 = getBasicBlockByName(F, "for.body.3"); + BasicBlock *ForBody4 = getBasicBlockByName(F, "for.body.4"); + BasicBlock *ForBody5 = getBasicBlockByName(F, "for.body.5"); + BasicBlock *ForBody6 = getBasicBlockByName(F, "for.body.6"); + BasicBlock *ForBody7 = getBasicBlockByName(F, "for.body.7"); + assert(ForBody1 && ForBody2 && ForBody3 && ForBody4 && ForBody5 && ForBody6 && + ForBody7 && "basic block not found"); + BBsToMerge.push_back(ForBody1); + BBsToMerge.push_back(ForBody2); + BBsToMerge.push_back(ForBody3); + BBsToMerge.push_back(ForBody4); + BBsToMerge.push_back(ForBody5); + BBsToMerge.push_back(ForBody6); + BBsToMerge.push_back(ForBody7); + + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *LoopHeaderClone = + cloneBasicBlockWithRelations(LoopHeader, ".clone", &F); + LoopHeaderClone->moveAfter(LoopHeader); + // Create a new basic block as for.end + BasicBlock *ForEnd = getBasicBlockByName(F, "for.cond.cleanup"); + assert(ForEnd && "basic block not found"); + ForEnd->setName("for.end"); + + 
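+  // From here on, LoopHeaderClone becomes the scalar remainder loop: point
+  // its back edge at itself and fix up its PHIs, then fold the unrolled
+  // copies for.body.1 .. for.body.7 into their single predecessor so the
+  // main unrolled body ends up as one straight-line block.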
LoopHeaderClone->getTerminator()->setSuccessor(1, LoopHeaderClone); + for (PHINode &Phi : LoopHeaderClone->phis()) { + Phi.setIncomingBlock(1, LoopHeaderClone); + } + + for (BasicBlock *BB : BBsToMerge) { + MergeBasicBlockIntoOnlyPred(BB); + } + + // Adjust positions + LoopHeaderClone->moveAfter(getBasicBlockByName(F, "for.body.7")); + assert(LoopHeaderClone && "basic block not found"); + ForEnd->moveAfter(LoopHeaderClone); + + BasicBlock &entryBB = F.getEntryBlock(); + auto [Sub, end_index] = modifyEntryBB(entryBB); + entryBB.getTerminator()->setSuccessor(1, ForBody7); + + SmallVector FAMSDInsts; + for (Instruction &I : *ForBody7) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::FAdd || + BinOp->getOpcode() == Instruction::FMul || + BinOp->getOpcode() == Instruction::FSub || + BinOp->getOpcode() == Instruction::FDiv) { + FAMSDInsts.push_back(BinOp); + } + } + } + assert(!FAMSDInsts.empty() && "fadd/fmul/fsub/fdiv instruction not found"); + PHINode *firstFloatPhi = getFirstFloatPhi(ForBody7); + assert(firstFloatPhi && "phi node not found"); + // Clone phi node 7 times + for (int i = 0; i < 7; i++) { + PHINode *clonedPhi = cast(firstFloatPhi->clone()); + clonedPhi->setName("result" + Twine(i)); + clonedPhi->insertAfter(firstFloatPhi); + auto *temp = FAMSDInsts[i]; + clonedPhi->setIncomingValue(1, temp); + temp->setOperand(0, clonedPhi); + } + + for (PHINode &Phi : ForBody7->phis()) { + Phi.setIncomingBlock(0, &entryBB); + auto *temp = Phi.clone(); + temp->setName("result0.0.lcssa"); + temp->insertBefore(LoopPreheader->getTerminator()); + } + + ICmpInst *lastICmp = getLastICmpInst(ForBody7); + assert(lastICmp && "icmp not found"); + lastICmp->setOperand(1, Sub); + lastICmp->setPredicate(ICmpInst::ICMP_SLT); + + ForBody7->getTerminator()->setSuccessor(0, LoopPreheader); + ForBody7->getTerminator()->setSuccessor(1, ForBody7); + + PHINode *firstI32Phi = getFirstI32Phi(LoopPreheader); + assert(firstI32Phi && "phi node not found"); + // Insert icmp slt instruction in LoopPreheader + IRBuilder<> Builder(LoopPreheader->getTerminator()); + ICmpInst *NewICmp = + cast(Builder.CreateICmpSLT(firstI32Phi, end_index, "cmp")); + + // Convert the original unconditional branch to a conditional branch + BranchInst *OldBr = cast(LoopPreheader->getTerminator()); + BranchInst *NewBr = BranchInst::Create(LoopHeaderClone, ForEnd, NewICmp); + ReplaceInstWithInst(OldBr, NewBr); + + Instruction *faddInst = nullptr; + Instruction *addNswInst = nullptr; + + for (auto &I : *LoopHeaderClone) { + if (auto *BinOp = dyn_cast(&I)) { + if ((BinOp->getOpcode() == Instruction::FAdd || + BinOp->getOpcode() == Instruction::FMul || + BinOp->getOpcode() == Instruction::FSub || + BinOp->getOpcode() == Instruction::FDiv) && + BinOp->getType()->isFloatTy()) { + faddInst = BinOp; + } else if (BinOp->getOpcode() == Instruction::Add && + BinOp->hasNoSignedWrap()) { + addNswInst = BinOp; + } + } + + if (faddInst && addNswInst) { + break; + } + } + assert(faddInst && addNswInst && + "fadd/fmul/fsub/fdiv float and add nsw instructions not found"); + PHINode *firstI32PhiLoopHeaderClone = getFirstI32Phi(LoopHeaderClone); + assert(firstI32PhiLoopHeaderClone && "phi node not found"); + firstI32PhiLoopHeaderClone->setIncomingValue(0, firstI32Phi); + firstI32PhiLoopHeaderClone->setIncomingValue(1, addNswInst); + + PHINode *firstFloatPhiLoopHeaderClone = getFirstFloatPhi(LoopHeaderClone); + assert(firstFloatPhiLoopHeaderClone && "phi node not found"); + PHINode *lastFloatPhiLoopPreheader = 
getLastFloatPhi(LoopPreheader); + assert(lastFloatPhiLoopPreheader && "phi node not found"); + firstFloatPhiLoopHeaderClone->setIncomingValue(0, lastFloatPhiLoopPreheader); + firstFloatPhiLoopHeaderClone->setIncomingValue(1, faddInst); + + // Collect all phi float instructions in LoopPreheader + SmallVector floatPhis; + for (auto &I : *LoopPreheader) { + if (auto *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + floatPhis.push_back(Phi); + } + } + } + + // Get the ret instruction in ExitingBlock + ReturnInst *RetInst = dyn_cast(ForEnd->getTerminator()); + if (!RetInst) { + assert(false && "ret instruction not found in ExitingBlock"); + return; + } + + // Get the original return value + Value *OriginalRetValue = RetInst->getOperand(0); + + // Create IRBuilder, set insertion point before the ret instruction + + Builder.SetInsertPoint(RetInst); + // Create a series of fadd instructions + assert(floatPhis.size() == 8 && "expected floatPhis has 8 phi node"); + Value *CurrentSum = nullptr; + Value *add64 = Builder.CreateFAdd(floatPhis[0], OriginalRetValue, "add64"); + Value *add65 = Builder.CreateFAdd(floatPhis[1], floatPhis[2], "add65"); + Value *add66 = Builder.CreateFAdd(floatPhis[3], floatPhis[4], "add66"); + Value *add67 = Builder.CreateFAdd(floatPhis[5], floatPhis[6], "add67"); + Value *add68 = Builder.CreateFAdd(add64, add65, "add68"); + Value *add69 = Builder.CreateFAdd(add66, add67, "add69"); + CurrentSum = Builder.CreateFAdd(add68, add69, "add70"); + + // Replace the original ret instruction + RetInst->setOperand(0, CurrentSum); + PHINode *firstFloatPhiForEnd = getFirstFloatPhi(ForEnd); + assert(firstFloatPhiForEnd && "phi node not found"); + // Remove existing incoming values from firstFloatPhiForEnd + while (firstFloatPhiForEnd->getNumIncomingValues() > 0) { + firstFloatPhiForEnd->removeIncomingValue(0u, false); + } + // Add two incoming values to firstFloatPhiForEnd + firstFloatPhiForEnd->addIncoming(faddInst, LoopHeaderClone); + firstFloatPhiForEnd->addIncoming(lastFloatPhiLoopPreheader, LoopPreheader); + + runDeadCodeElimination(F); +} + +static bool shouldUnrollCorr(Function &F, LoopInfo *LI) { + if (F.size() != 7) + return false; + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *Return = getBasicBlockByName(F, "return"); + + if (!Entry || !ForCondPreheader || !Return) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != Return || + Entry->getTerminator()->getSuccessor(1) != ForCondPreheader) { + return false; + } + + // Feature 2: Has 5 parameters + if (F.arg_size() != 5) { + return false; + } + + unsigned int loopNestLevel = 0; + for (auto &BB : F) { + if (isa(BB.getTerminator())) { + loopNestLevel = std::max(loopNestLevel, LI->getLoopDepth(&BB)); + } + } + if (loopNestLevel != 2) { + return false; + } + + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + return true; +} + +static bool shouldUnrollConvccorr(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 17) + return false; + + // Check the number of parameters + if (F.arg_size() != 5) { + return false; + } + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth 
!= 2) { + return false; + } + + // Check if the fmuladd.f32 inline function is used + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForEnd = getBasicBlockByName(F, "for.end"); + BasicBlock *Return = getBasicBlockByName(F, "return"); + + if (!Entry || !ForBody || !ForEnd || !Return) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != Return || + ForEnd->getTerminator()->getSuccessor(1) != ForBody) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + if (L->getSubLoops().size() == 1) { + innerLoopCount++; + } + } + } + + if (outerLoopCount != 3 || innerLoopCount != 3) { + return false; + } + + // Check if there are three icmp eq instructions in the entry basic block + int icmpEqCount = 0; + for (auto &I : *Entry) { + if (auto *ICmp = dyn_cast(&I)) { + if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { + icmpEqCount++; + } + } + } + + if (icmpEqCount != 3) { + return false; + } + + return true; +} + +static bool shouldUnrollFird(Function &F, LoopInfo *LI) { + + // Check the number of basic blocks + if (F.size() != 14) + return false; + + // Check the number of parameters + if (F.arg_size() != 4) { + return false; + } + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 2) { + return false; + } + + // Check if the fmuladd.f32 inline function is used + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + + if (!Entry || !ForCondCleanup) + return false; + + if (Entry->getTerminator()->getSuccessor(1) != ForCondCleanup) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + } else if (L->getLoopDepth() == 2) { + innerLoopCount++; + } else { + return false; + } + } + + if (outerLoopCount != 1 || innerLoopCount != 3) { + return false; + } + + return true; +} + +static bool shouldUnrollFirType(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 19) + return false; + + // Check the number of parameters + if (F.arg_size() != 4) { + return false; + } + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 2) { + return false; + } + + // Check if the fmuladd.f32 inline function is used + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + 
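+  // The remaining checks pin down the exact CFG shape of the fir kernel:
+  // the named blocks below must exist and be wired together as expected,
+  // and the loop nest must consist of two outer loops with two (clang) or
+  // four (opt) inner loops.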
BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *ForBodyLrPh = getBasicBlockByName(F, "for.body.lr.ph"); + BasicBlock *IfEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForBodyClone = getBasicBlockByName(F, "for.body.clone"); + BasicBlock *ForBodyLrPhClone = getBasicBlockByName(F, "for.body.lr.ph.clone"); + + if (!Entry || !ForCondPreheader || !ForBodyLrPh || !IfEnd || !ForBody || + !ForBodyClone || !ForBodyLrPhClone) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != ForCondPreheader || + Entry->getTerminator()->getSuccessor(1) != ForBodyLrPhClone || + ForCondPreheader->getTerminator()->getSuccessor(0) != ForBodyLrPh || + ForCondPreheader->getTerminator()->getSuccessor(1) != IfEnd || + ForBodyLrPh->getSingleSuccessor() != ForBody || + ForBodyLrPhClone->getSingleSuccessor() != ForBodyClone) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + } else if (L->getLoopDepth() == 2) { + innerLoopCount++; + } else { + return false; + } + } + // for opt is 4, for clang is 2. + if (outerLoopCount != 2 || (innerLoopCount != 2 && innerLoopCount != 4)) { + return false; + } + + return true; +} + +static void eraseAllStoreInstInBB(BasicBlock *BB) { + assert(BB && "BasicBlock is nullptr"); + // Erase all store instructions in BB + for (auto it = BB->begin(); it != BB->end();) { + if (isa(&*it)) { + it = it->eraseFromParent(); + } else { + ++it; + } + } +} + +static GetElementPtrInst *getUniqueGetElementPtrInst(BasicBlock *BB) { + assert(BB && "BasicBlock is nullptr"); + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = nullptr; + for (Instruction &I : *BB) { + if (auto *GEPI = dyn_cast(&I)) { + if (!GEP) { + GEP = GEPI; + } else { + // If multiple getelementptr instructions are found, set GEP to nullptr + // and exit the loop + GEP = nullptr; + break; + } + } + } + assert(GEP && "getelementptr instruction not found"); + return GEP; +} + +static void createCriticalEdgeAndMoveStoreInst(BasicBlock *CloneForBody, + BasicBlock *ForEnd37) { + CloneForBody->getTerminator()->setSuccessor(1, CloneForBody); + // Create a new BasicBlock: for.cond.for.end_crit_edge + BasicBlock *CriticalEdge = BasicBlock::Create( + CloneForBody->getContext(), "for.cond.for.end_crit_edge", + CloneForBody->getParent(), ForEnd37); + + // Update the terminator instruction of CloneForBody + CloneForBody->getTerminator()->setSuccessor(0, CriticalEdge); + + // Create an unconditional branch instruction to jump to OldForEnd + BranchInst::Create(ForEnd37, CriticalEdge); + + // Find and move the StoreInst in CloneForBody to CriticalEdge + StoreInst *StoreToMove = nullptr; + for (auto &Inst : *CloneForBody) { + if (auto *Store = dyn_cast(&Inst)) { + StoreToMove = Store; + break; + } + } + + if (StoreToMove) { + StoreToMove->removeFromParent(); + StoreToMove->insertBefore(CriticalEdge->getTerminator()); + } +} +static std::tuple +modifyOuterLoop4(Loop *L, BasicBlock *ForBodyMerged, + BasicBlock *CloneForBodyPreheader) { + BasicBlock *BB = L->getHeader(); + PHINode *phi = getLastPhi(BB); + // Add new instructions + IRBuilder<> Builder(BB); + Builder.SetInsertPoint(phi->getNextNode()); + + // and i32 %n.0551, -8 + Value *Add2 = Builder.CreateAnd(phi, 
ConstantInt::get(phi->getType(), -8)); + + // %sub = and i32 %n.0551, 2147483644 + Value *Sub = + Builder.CreateAnd(phi, ConstantInt::get(phi->getType(), 2147483640)); + + // %cmp12538.not = icmp eq i32 %sub, 0 + Value *Cmp = Builder.CreateICmpEQ(Sub, ConstantInt::get(phi->getType(), 0)); + + // br i1 %cmp12538.not, label %for.cond.cleanup, label %for.body.preheader + // Move the conditional branch instruction to the end of BB + auto *newcondBr = + Builder.CreateCondBr(Cmp, CloneForBodyPreheader, ForBodyMerged); + + // Erase the terminator instruction of BB + Instruction *oldTerminator = BB->getTerminator(); + newcondBr->moveAfter(oldTerminator); + oldTerminator->eraseFromParent(); + + // Erase all store instructions in BB + eraseAllStoreInstInBB(BB); + for (PHINode &Phi : ForBodyMerged->phis()) { + Phi.setIncomingBlock(1, CloneForBodyPreheader); + } + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = getUniqueGetElementPtrInst(BB); + return std::make_tuple(Sub, GEP, Add2); +} + +static void modifyInnerLoop4(Loop *L, BasicBlock *ForBodyMerged, Value *Sub, + BasicBlock *CloneForBody, GetElementPtrInst *GEP, + Value *Add2, BasicBlock *CloneForBodyPreheader) { + BasicBlock *OuterBB = L->getHeader(); + SmallVector FMulAddCalls; + insertPhiNodesForFMulAdd(ForBodyMerged, OuterBB, FMulAddCalls); + movePHINodesToTop(*ForBodyMerged); + + groupAndReorderInstructions(ForBodyMerged); + ICmpInst *LastICmp = getLastICmpInst(ForBodyMerged); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + LastICmp->setOperand(1, Sub); + swapTerminatorSuccessors(ForBodyMerged); + eraseAllStoreInstInBB(ForBodyMerged); + + Function *F = ForBodyMerged->getParent(); + + BasicBlock *NewForEnd = + BasicBlock::Create(F->getContext(), "for.end", F, ForBodyMerged); + NewForEnd->moveAfter(ForBodyMerged); + + // Create an instruction to add the results of four FMulAdd calls + assert(FMulAddCalls.size() == 8 && "Expected 8 FMulAdd calls"); + Value *Sum = nullptr; + Value *sum = BinaryOperator::CreateFAdd(FMulAddCalls[0], FMulAddCalls[1], + "sum", NewForEnd); + Value *sum23 = BinaryOperator::CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], + "sum23", NewForEnd); + Value *sum24 = BinaryOperator::CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], + "sum24", NewForEnd); + Value *sum25 = BinaryOperator::CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], + "sum25", NewForEnd); + Value *sum26 = BinaryOperator::CreateFAdd(sum, sum23, "sum26", NewForEnd); + Value *sum27 = BinaryOperator::CreateFAdd(sum24, sum25, "sum27", NewForEnd); + Sum = BinaryOperator::CreateFAdd(sum26, sum27, "sum28", NewForEnd); + IRBuilder<> Builder(NewForEnd); + Builder.SetInsertPoint(NewForEnd); + // Create a new StoreInst instruction + Builder.CreateStore(Sum, GEP); + // Create a comparison instruction + Value *Cmp = Builder.CreateICmpUGT(Add2, GEP->getOperand(1), "cmp37.not548"); + + // Create a conditional branch instruction + Builder.CreateCondBr(Cmp, ForBodyMerged->getTerminator()->getSuccessor(1), + CloneForBodyPreheader); + ForBodyMerged->getTerminator()->setSuccessor(1, NewForEnd); + CloneForBodyPreheader->moveAfter(NewForEnd); + CloneForBody->moveAfter(CloneForBodyPreheader); + + // Create a PHI node in CloneForBodyPreheader + PHINode *SumPHI = PHINode::Create(Sum->getType(), 2, "sum.phi", + CloneForBodyPreheader->getFirstNonPHI()); + + // Set the incoming values of the PHI node + SumPHI->addIncoming(ConstantFP::get(Sum->getType(), 0.0), OuterBB); + SumPHI->addIncoming(Sum, NewForEnd); + + // Create a PHI node in CloneForBodyPreheader + PHINode 
*AddPHI = PHINode::Create(Add2->getType(), 2, "add.phi", + CloneForBodyPreheader->getFirstNonPHI()); + + // Set the incoming values of the PHI node + AddPHI->addIncoming(ConstantInt::get(Add2->getType(), 0), OuterBB); + AddPHI->addIncoming(Add2, NewForEnd); + Value *phifloatincomingvalue0 = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + Value *phii32incomingvalue0 = getLastICmpInst(CloneForBody)->getOperand(0); + for (PHINode &Phi : CloneForBody->phis()) { + if (Phi.getType()->isIntegerTy(32)) { + Phi.setIncomingValue(0, AddPHI); + Phi.setIncomingBlock(0, CloneForBodyPreheader); + Phi.setIncomingValue(1, phii32incomingvalue0); + Phi.setIncomingBlock(1, CloneForBody); + } else if (Phi.getType()->isFloatTy()) { + Phi.setIncomingValue(0, SumPHI); + Phi.setIncomingBlock(0, CloneForBodyPreheader); + Phi.setIncomingValue(1, phifloatincomingvalue0); + Phi.setIncomingBlock(1, CloneForBody); + } + } + BasicBlock *OldForEnd = CloneForBody->getTerminator()->getSuccessor(0); + createCriticalEdgeAndMoveStoreInst(CloneForBody, OldForEnd); + + getFirstI32Phi(ForBodyMerged)->setIncomingBlock(1, ForBodyMerged); +} + +static std::tuple +modifyOuterLoop8(Loop *L) { + BasicBlock *BB = L->getHeader(); + ICmpInst *LastICmp = getLastICmpInst(BB); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + swapTerminatorSuccessors(BB); + + eraseAllStoreInstInBB(BB); + Value *lsig_0 = getFirstI32Phi(BB)->getIncomingValue(0); + Value *add207 = LastICmp->getOperand(0); + Value *sub206 = cast(add207)->getOperand(0); + // Add new instructions before LastICmp + IRBuilder<> Builder(LastICmp); + + // %add207.neg = xor i32 %sub206, -1 + Value *Add207Neg = Builder.CreateXor( + sub206, ConstantInt::get(sub206->getType(), -1), "add207.neg"); + + // %add211 = add i32 %lsig.0, %add207.neg + Value *Add211 = Builder.CreateAdd(lsig_0, Add207Neg, "add211"); + + // %div212535 = and i32 %add211, -8 + Value *Div212535 = Builder.CreateAnd( + Add211, ConstantInt::get(Add211->getType(), -8), "div212535"); + + // %add214 = add i32 %div212535, %add207 + Value *Add214 = Builder.CreateAdd(Div212535, add207, "add214"); + + // Set the second operand of LastICmp to Add214 + LastICmp->setOperand(1, Add214); + + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = getUniqueGetElementPtrInst(BB); + + return std::make_tuple(Add214, add207, GEP); +} + +static std::tuple +modifyOuterLoop16(Loop *L) { + BasicBlock *BB = L->getHeader(); + BasicBlock *BBLoopPreHeader = L->getLoopPreheader(); + ICmpInst *LastICmp = getLastICmpInst(BB); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + swapTerminatorSuccessors(BB); + + eraseAllStoreInstInBB(BB); + Value *lkern_0 = getFirstI32Phi(BB)->getIncomingValue(1); + // Insert an and instruction in BBLoopPreHeader + IRBuilder<> Builder(BBLoopPreHeader->getTerminator()); + Value *Div536 = Builder.CreateAnd(lkern_0, -16, "div536"); + // Get the first operand of LastICmp + Value *Add56 = LastICmp->getOperand(0); + + // Create an add instruction before LastICmp + Builder.SetInsertPoint(LastICmp); + Value *Add60 = Builder.CreateAdd(Div536, Add56, "add60"); + + // Set the second operand of LastICmp to Add60 + LastICmp->setOperand(1, Add60); + + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = getUniqueGetElementPtrInst(BB); + + return std::make_tuple(Add60, Add56, GEP); +} + +static void modifyInnerLoop(Loop *L, BasicBlock *ForBodyMerged, Value *Add60, + BasicBlock *CloneForBody, Value *Add56, + GetElementPtrInst *GEP, uint32_t unroll_count) { + assert((unroll_count 
== 8 || unroll_count == 16) && + "unroll_count must be 8 or 16"); + BasicBlock *OuterBB = L->getHeader(); + + // Find the predecessor BasicBlock of ForBodyMergedPreheader + BasicBlock *PredBB = ForBodyMerged->getSinglePredecessor(); + if (!PredBB) { + // If there is no single predecessor, traverse all predecessors + for (BasicBlock *Pred : predecessors(ForBodyMerged)) { + PredBB = Pred; + break; // Take the first predecessor + } + } + assert(PredBB && "can't find predecessor of ForBodyMerged"); + + SmallVector FMulAddCalls; + insertPhiNodesForFMulAdd(ForBodyMerged, PredBB, FMulAddCalls); + + movePHINodesToTop(*ForBodyMerged); + + groupAndReorderInstructions(ForBodyMerged); + ICmpInst *LastICmp = getLastICmpInst(ForBodyMerged); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + LastICmp->setOperand(1, Add60); + swapTerminatorSuccessors(ForBodyMerged); + eraseAllStoreInstInBB(ForBodyMerged); + + BasicBlock *ForEndLoopExit = ForBodyMerged->getTerminator()->getSuccessor(1); + // Create an instruction to add the results of four FMulAdd calls + Value *Sum = nullptr; + if (unroll_count == 16) { + Value *sum45 = + BinaryOperator::CreateFAdd(FMulAddCalls[0], FMulAddCalls[1], "sum45", + ForEndLoopExit->getTerminator()); + Value *sum46 = + BinaryOperator::CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], "sum46", + ForEndLoopExit->getTerminator()); + Value *sum47 = + BinaryOperator::CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], "sum47", + ForEndLoopExit->getTerminator()); + Value *sum48 = + BinaryOperator::CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], "sum48", + ForEndLoopExit->getTerminator()); + Value *sum49 = + BinaryOperator::CreateFAdd(FMulAddCalls[8], FMulAddCalls[9], "sum49", + ForEndLoopExit->getTerminator()); + Value *sum50 = + BinaryOperator::CreateFAdd(FMulAddCalls[10], FMulAddCalls[11], "sum50", + ForEndLoopExit->getTerminator()); + Value *sum51 = + BinaryOperator::CreateFAdd(FMulAddCalls[12], FMulAddCalls[13], "sum51", + ForEndLoopExit->getTerminator()); + Value *sum52 = + BinaryOperator::CreateFAdd(FMulAddCalls[14], FMulAddCalls[15], "sum52", + ForEndLoopExit->getTerminator()); + + Value *sum53 = BinaryOperator::CreateFAdd(sum45, sum46, "sum53", + ForEndLoopExit->getTerminator()); + Value *sum54 = BinaryOperator::CreateFAdd(sum47, sum48, "sum54", + ForEndLoopExit->getTerminator()); + Value *sum55 = BinaryOperator::CreateFAdd(sum49, sum50, "sum55", + ForEndLoopExit->getTerminator()); + Value *sum56 = BinaryOperator::CreateFAdd(sum51, sum52, "sum56", + ForEndLoopExit->getTerminator()); + + Value *sum57 = BinaryOperator::CreateFAdd(sum53, sum54, "sum57", + ForEndLoopExit->getTerminator()); + Value *sum58 = BinaryOperator::CreateFAdd(sum55, sum56, "sum58", + ForEndLoopExit->getTerminator()); + + Sum = BinaryOperator::CreateFAdd(sum57, sum58, "sum59", + ForEndLoopExit->getTerminator()); + } else if (unroll_count == 8) { + Value *sum60 = + BinaryOperator::CreateFAdd(FMulAddCalls[0], FMulAddCalls[1], "sum60", + ForEndLoopExit->getTerminator()); + Value *sum61 = + BinaryOperator::CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], "sum61", + ForEndLoopExit->getTerminator()); + Value *sum62 = + BinaryOperator::CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], "sum62", + ForEndLoopExit->getTerminator()); + Value *sum63 = + BinaryOperator::CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], "sum63", + ForEndLoopExit->getTerminator()); + + Value *sum64 = BinaryOperator::CreateFAdd(sum60, sum61, "sum64", + ForEndLoopExit->getTerminator()); + Value *sum65 = BinaryOperator::CreateFAdd(sum62, sum63, "sum65", + 
ForEndLoopExit->getTerminator()); + Sum = BinaryOperator::CreateFAdd(sum64, sum65, "sum66", + ForEndLoopExit->getTerminator()); + } + + // Create a new basic block for.end164 + BasicBlock *ForEnd164 = BasicBlock::Create( + ForEndLoopExit->getContext(), "for.end164", ForEndLoopExit->getParent(), + ForEndLoopExit->getNextNode()); + + // Set the target of the terminator instruction of ForEndLoopExit to + // for.end164 + Instruction *Terminator = ForEndLoopExit->getTerminator(); + BasicBlock *OldSuccessor = Terminator->getSuccessor(0); + Terminator->setSuccessor(0, ForEnd164); + + // Create an unconditional branch instruction in for.end164, jumping to the + // original successor basic block + BranchInst::Create(OldSuccessor, ForEnd164); + + // Create a new phi node in for.end164 + PHINode *PhiSum = PHINode::Create(Type::getInt32Ty(ForEnd164->getContext()), + 2, "phi.sum", ForEnd164->getFirstNonPHI()); + + // Set the incoming values of the phi node + PhiSum->addIncoming(Add56, OuterBB); + PhiSum->addIncoming(LastICmp->getOperand(0), ForEndLoopExit); + + // Create a new phi float node in for.end164 + PHINode *PhiFloat = + PHINode::Create(Type::getFloatTy(ForEnd164->getContext()), 2, "phi.float", + ForEnd164->getFirstNonPHI()); + + // Set the incoming values of the phi node + PhiFloat->addIncoming( + ConstantFP::get(Type::getFloatTy(ForEnd164->getContext()), 0.0), OuterBB); + PhiFloat->addIncoming(Sum, ForEndLoopExit); + // Create a new StoreInst instruction in for.end164 + new StoreInst(PhiFloat, GEP, ForEnd164->getTerminator()); + + Value *operand1 = unroll_count == 16 + ? getFirstI32Phi(OuterBB) + : getLastICmpInst(CloneForBody)->getOperand(1); + // Create a new comparison instruction + ICmpInst *NewCmp = + new ICmpInst(ICmpInst::ICMP_UGT, PhiSum, operand1, "cmp182.not587"); + NewCmp->insertBefore(ForEnd164->getTerminator()); + + // Replace the original unconditional branch with a conditional branch + BranchInst *OldBr = cast(ForEnd164->getTerminator()); + BasicBlock *ForEnd37 = OldBr->getSuccessor(0); + BranchInst *NewBr = BranchInst::Create(ForEnd37, CloneForBody, NewCmp); + ReplaceInstWithInst(OldBr, NewBr); + + CloneForBody->moveAfter(ForEnd164); + Instruction *TargetInst = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + for (PHINode &Phi : CloneForBody->phis()) { + if (Phi.getType()->isIntegerTy(32)) { + Phi.setIncomingValue(0, getLastICmpInst(CloneForBody)->getOperand(0)); + Phi.setIncomingBlock(0, CloneForBody); + Phi.setIncomingValue(1, PhiSum); + Phi.setIncomingBlock(1, ForEnd164); + } else if (Phi.getType()->isFloatTy()) { + Phi.setIncomingValue(0, TargetInst); + Phi.setIncomingBlock(0, CloneForBody); + Phi.setIncomingValue(1, PhiFloat); + Phi.setIncomingBlock(1, ForEnd164); + } + } + + createCriticalEdgeAndMoveStoreInst(CloneForBody, ForEnd37); + + OuterBB->getTerminator()->setSuccessor(1, ForEnd164); +} + +static void PostUnrollConv(Function &F, Loop *L, int unroll_count, + int unroll_index) { + BasicBlock *ForBody = L->getHeader(); + BasicBlock *CloneForBody = + cloneBasicBlockWithRelations(ForBody, ".clone", &F); + CloneForBody->moveAfter(ForBody); + // Set the second branch of the terminator instruction of CloneForBody to + // ForBody + CloneForBody->getTerminator()->setSuccessor(1, ForBody); + + StringRef ForBodyName = ForBody->getName(); + // Get the basic blocks to merge + std::vector BBsToMerge; + for (int i = 1; i < unroll_count; ++i) { + std::string BBName = (ForBodyName + "." 
+ std::to_string(i)).str(); + BasicBlock *ForBodyClone = getBasicBlockByName(F, BBName); + if (ForBodyClone) { + BBsToMerge.push_back(ForBodyClone); + } + } + + if (BBsToMerge.size() == static_cast(unroll_count - 1)) { + for (BasicBlock *BB : BBsToMerge) { + MergeBasicBlockIntoOnlyPred(BB); + } + } + // Get the outer loop of L + Loop *OuterLoop = L->getParentLoop(); + if (unroll_count == 8 && unroll_index == 0) { + BasicBlock *CloneForBodyPreheader = BasicBlock::Create( + CloneForBody->getContext(), CloneForBody->getName() + ".preheader", + CloneForBody->getParent(), CloneForBody); + + updatePredecessorsToPreheader(CloneForBody, CloneForBodyPreheader); + auto [Sub, GEP, Add2] = + modifyOuterLoop4(OuterLoop, BBsToMerge[6], CloneForBodyPreheader); + modifyInnerLoop4(OuterLoop, BBsToMerge[6], Sub, CloneForBody, GEP, Add2, + CloneForBodyPreheader); + } else if (unroll_count == 16) { + auto [Add60, Add56, GEP] = modifyOuterLoop16(OuterLoop); + modifyInnerLoop(OuterLoop, BBsToMerge[14], Add60, CloneForBody, Add56, GEP, + unroll_count); + } else if (unroll_count == 8) { + auto [Add214, Add207, GEP] = modifyOuterLoop8(OuterLoop); + modifyInnerLoop(OuterLoop, BBsToMerge[6], Add214, CloneForBody, Add207, GEP, + unroll_count); + } + LLVM_DEBUG(F.dump()); +} + +static void modifyFirstCloneForBody(BasicBlock *CloneForBody, + PHINode *N_0_lcssa, + BasicBlock *ForBody27LrPh, + PHINode *CoeffPosLcssa, Value *Operand1) { + CloneForBody->getTerminator()->setSuccessor(1, CloneForBody); + for (PHINode &Phi : CloneForBody->phis()) { + Phi.setIncomingBlock(0, ForBody27LrPh); + Phi.setIncomingBlock(1, CloneForBody); + } + PHINode *FirstI32Phi = getFirstI32Phi(CloneForBody); + PHINode *LastI32Phi = getLastI32Phi(CloneForBody); + FirstI32Phi->setIncomingValue(0, N_0_lcssa); + FirstI32Phi->setIncomingBlock(0, ForBody27LrPh); + + Instruction *firstAddInst = nullptr; + Instruction *lastAddInst = nullptr; + for (Instruction &I : *CloneForBody) { + if (I.getOpcode() == Instruction::Add) { + if (!firstAddInst) { + firstAddInst = &I; + } + lastAddInst = &I; + } + } + ICmpInst *LastCmpInst = getLastICmpInst(CloneForBody); + LastCmpInst->setOperand(0, lastAddInst); + LastCmpInst->setOperand(1, Operand1); + FirstI32Phi->setIncomingValue(1, lastAddInst); + + LastI32Phi->setIncomingValue(0, CoeffPosLcssa); + LastI32Phi->setIncomingBlock(0, ForBody27LrPh); + + LastI32Phi->setIncomingValue(1, firstAddInst); +} + +static bool setBBFromOtherBB(Function &F, StringRef BBName, + BasicBlock *ForBodyMerged) { + // Find the first and last load instructions in ForBody27LrPh + LoadInst *FirstLoad = nullptr; + LoadInst *LastLoad = nullptr; + BasicBlock *ForBody27LrPh = getBasicBlockByName(F, BBName); + for (Instruction &I : *ForBody27LrPh) { + if (auto *LI = dyn_cast(&I)) { + if (!FirstLoad) { + FirstLoad = LI; + } + LastLoad = LI; + } + } + + assert(FirstLoad && LastLoad && "Find load instructions in ForBody27LrPh"); + + // modify getelementptr + // Traverse the GEP instructions in ForBodyMerged + std::vector GEPInsts; + for (Instruction &I : *ForBodyMerged) { + if (auto *GEP = dyn_cast(&I)) { + GEPInsts.push_back(GEP); + } + } + // Ensure there is at least one GEP instruction + if (!GEPInsts.empty()) { + for (size_t i = 0; i < GEPInsts.size(); ++i) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + if (i % 2 == 1) { // Odd + CurrentGEP->setOperand(0, LastLoad); + } else { // Even + CurrentGEP->setOperand(0, FirstLoad); + } + } + } + return true; +} + +// Function to modify the first loop in FIRD (Finite Impulse Response Design) +// 
transformation +static void modifyFirdFirstLoop(Function &F, Loop *L, BasicBlock *ForBodyMerged, + BasicBlock *CloneForBody) { + BasicBlock *ForCond23Preheader = + ForBodyMerged->getTerminator()->getSuccessor(0)->getSingleSuccessor(); + assert(ForCond23Preheader && + "ForCondPreheader should have single predecessor"); + + BasicBlock *ForCondCleanup3 = + getFirstI32Phi(ForCond23Preheader)->getIncomingBlock(0); + Instruction *FirstI32Phi = getFirstI32Phi(ForCondCleanup3); + + ICmpInst *LastICmp = getLastICmpInst(ForCondCleanup3); + // Create new add instruction + IRBuilder<> Builder(LastICmp); + Value *Add269 = Builder.CreateNSWAdd( + FirstI32Phi, ConstantInt::get(FirstI32Phi->getType(), 8), "add269"); + LastICmp->setOperand(0, Add269); + LastICmp->setPredicate(ICmpInst::ICMP_SGT); + swapTerminatorSuccessors(ForCondCleanup3); + + PHINode *N_069 = getFirstI32Phi(ForBodyMerged); + Value *Inc20_7 = N_069->getIncomingValue(1); + BasicBlock *ForBodyMergedLoopPreheader = N_069->getIncomingBlock(0); + // Create new phi node at the beginning of ForBodyMerged + PHINode *Add281 = PHINode::Create(Type::getInt32Ty(F.getContext()), 2, + "add281", &ForBodyMerged->front()); + + // Set incoming values for phi node + Add281->addIncoming(Add269, ForBodyMergedLoopPreheader); + Add281->addIncoming(Inc20_7, ForBodyMerged); + + N_069->setIncomingValue(1, Add281); + + ICmpInst *LastICmpInPreheader = getLastICmpInst(ForCond23Preheader); + // Create new phi node + PHINode *N_0_lcssa = PHINode::Create(Type::getInt32Ty(F.getContext()), 2, + "n.0.lcssa", LastICmpInPreheader); + + // Set incoming values for phi node + N_0_lcssa->addIncoming(FirstI32Phi, ForCondCleanup3); + N_0_lcssa->addIncoming(Add281, ForBodyMerged); + + // Replace operand of LastICmpInPreheader with new phi node + LastICmpInPreheader->setOperand(0, N_0_lcssa); + LastICmpInPreheader->setPredicate(ICmpInst::ICMP_SLT); + + Value *Operand1 = LastICmp->getOperand(1); + LastICmpInPreheader->setOperand(1, Operand1); + + // Get %coeff_pos.0.lcssa + PHINode *CoeffPosLcssa = getFirstI32Phi(ForCond23Preheader); + + // Insert new add instruction at the end of ForBodyMergedLoopPreheader + BasicBlock *ForBody27LrPh = + ForCond23Preheader->getTerminator()->getSuccessor(0); + Builder.SetInsertPoint(ForBody27LrPh->getTerminator()); + Value *Add11 = Builder.CreateAdd(Operand1, CoeffPosLcssa); + + ForBody27LrPh->getTerminator()->setSuccessor(0, CloneForBody); + ICmpInst *LastICmpInForBodyMerged = getLastICmpInst(ForBodyMerged); + LastICmpInForBodyMerged->setOperand(1, Operand1); + LastICmpInForBodyMerged->setOperand(0, Inc20_7); + + modifyFirstCloneForBody(CloneForBody, N_0_lcssa, ForBody27LrPh, CoeffPosLcssa, + Operand1); + + PHINode *acc_0_lcssa = getFirstFloatPhi(ForCond23Preheader); + BasicBlock *ForCond23PreheaderLoopExit = acc_0_lcssa->getIncomingBlock(1); + PHINode *_lcssa = getFirstFloatPhi(ForCond23PreheaderLoopExit); + acc_0_lcssa->setIncomingValue(1, _lcssa->getIncomingValue(0)); + acc_0_lcssa->setIncomingBlock(1, _lcssa->getIncomingBlock(0)); + + Value *floatZero = acc_0_lcssa->getIncomingValue(0); + + // Get all incoming values and blocks for PHINode + for (unsigned i = 1; i < _lcssa->getNumIncomingValues(); ++i) { + Value *IncomingValue = _lcssa->getIncomingValue(i); + BasicBlock *IncomingBlock = _lcssa->getIncomingBlock(i); + + // Create new phi node in ForCond23Preheader + PHINode *NewPhi = + PHINode::Create(floatZero->getType(), 2, + "acc." 
+ std::to_string(i) + ".lcssa", CoeffPosLcssa); + // Add incoming values + NewPhi->addIncoming(floatZero, ForCondCleanup3); + NewPhi->addIncoming(IncomingValue, IncomingBlock); + } + Value *coeff_pos_068 = getLastI32Phi(ForBodyMerged)->getIncomingValue(1); + CoeffPosLcssa->setIncomingValue(1, coeff_pos_068); + + getLastFloatPhi(CloneForBody)->setIncomingValue(0, acc_0_lcssa); + + BasicBlock *PredBB = ForBodyMerged->getSinglePredecessor(); + if (!PredBB) { + // If no single predecessor, iterate through all predecessors + for (BasicBlock *Pred : predecessors(ForBodyMerged)) { + PredBB = Pred; + break; // Only take first predecessor + } + } + SmallVector FMulAddCalls; + // insertPhiNodesForFMulAdd(ForBodyMerged, ForCond23PreHeader, FMulAddCalls); + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + for (Instruction &I : *ForBodyMerged) { + if (CallInst *CI = dyn_cast(&I)) { + if (Function *F = CI->getCalledFunction()) { + if (F->getName() == "llvm.fmuladd.f32" && CI->isTailCall()) { + FMulAddCalls.push_back(CI); + } + } + } + } + + // Insert phi nodes for each FMulAdd call + for (CallInst *CI : FMulAddCalls) { + // Create new phi node + PHINode *PHI = PHINode::Create(CI->getType(), 2, CI->getName() + "acc", CI); + + // Set incoming values for phi node + PHI->addIncoming(ConstantFP::get(CI->getType(), 0), PredBB); + PHI->addIncoming(CI, ForBodyMerged); + + CI->setOperand(2, PHI); + } + movePHINodesToTop(*ForBodyMerged); + modifyAddToOr(ForBodyMerged); + + ICmpInst *LastICmpForBodyMerged = getLastICmpInst(ForBodyMerged); + LastICmpForBodyMerged->setPredicate(ICmpInst::ICMP_SGT); + cast(LastICmpForBodyMerged->getOperand(0)) + ->setOperand(0, getFirstI32Phi(ForBodyMerged)); + + // Find first and last load instructions in ForBody14LrPh + LoadInst *FirstLoad = nullptr; + LoadInst *LastLoad = nullptr; + BasicBlock *ForBody14LrPh = getBasicBlockByName(F, "for.body14.lr.ph"); + for (Instruction &I : *ForBody14LrPh) { + if (auto *LI = dyn_cast(&I)) { + if (!FirstLoad) { + FirstLoad = LI; + } + LastLoad = LI; + } + } + + assert(FirstLoad && LastLoad && + "Failed to find load instructions in ForBody14LrPh"); + + // modify getelementptr + // Iterate through getelementptr instructions in ForBodyMerged + std::vector GEPInsts; + for (Instruction &I : *ForBodyMerged) { + if (auto *GEP = dyn_cast(&I)) { + GEPInsts.push_back(GEP); + } + } + // Ensure at least one getelementptr instruction exists + if (!GEPInsts.empty()) { + for (size_t i = 0; i < GEPInsts.size(); ++i) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + if (i % 2 == 1) { // Odd + CurrentGEP->setOperand(0, LastLoad); + } else { // Even + CurrentGEP->setOperand(0, FirstLoad); + } + } + } + + // Ensure at least one getelementptr instruction exists + if (!GEPInsts.empty()) { + // Get first getelementptr instruction + GetElementPtrInst *SecondGEP = GEPInsts[1]; + + // Starting from index 1, process every other getelementptr + for (size_t i = 3; i < GEPInsts.size(); i += 2) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + // Set current getelementptr's operand 0 to first getelementptr's value + CurrentGEP->setOperand(0, SecondGEP); + + // Set operand 1 to current index value + // ConstantInt *IndexValue = + // ConstantInt::get(CurrentGEP->getOperand(1)->getType(), i); + CurrentGEP->setOperand( + 1, ConstantInt::get(CurrentGEP->getOperand(1)->getType(), (i) / 2)); + } + } + + setBBFromOtherBB(F, "for.body27.lr.ph", CloneForBody); + + BasicBlock *ForCondCleanup26LoopExit = CloneForBody->getNextNode(); + BasicBlock *ForCondCleanup26 = 
ForCondCleanup26LoopExit->getSingleSuccessor(); + Instruction *tailcallInst = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + + // Find add instruction in ForBody27LrPh + Instruction *AddInst = nullptr; + for (Instruction &I : *ForBody27LrPh) { + if (I.getOpcode() == Instruction::Add) { + AddInst = &I; + break; + } + } + + // Insert new instructions in ForCondCleanup26LoopExit + Builder.SetInsertPoint(ForCondCleanup26LoopExit->getFirstNonPHI()); + Value *SubResult = Builder.CreateSub(AddInst, N_0_lcssa); + PHINode *firstFloatPhi = getFirstFloatPhi(ForCondCleanup26); + firstFloatPhi->setIncomingValue(1, tailcallInst); + + ForCond23Preheader->setName("for.cond63.preheader"); + // Create new PHI node in ForCondCleanup26 + PHINode *CoeffPosLcssaPhi = + PHINode::Create(CoeffPosLcssa->getType(), 2, "coeff_pos.1.lcssa", + &ForCondCleanup26->front()); + + // Set incoming values and blocks for PHI node + CoeffPosLcssaPhi->addIncoming(CoeffPosLcssa, ForCond23Preheader); + CoeffPosLcssaPhi->addIncoming(SubResult, ForCondCleanup26LoopExit); + // eraseAllStoreInstInBB(ForCondCleanup26); + + ICmpInst *LastICmpForCondCleanup26 = getLastICmpInst(ForCondCleanup26); + + LastICmpForCondCleanup26->setPredicate(ICmpInst::ICMP_SLT); + PHINode *FirstI32ForCondCleanup3 = getFirstI32Phi(ForCondCleanup3); + LastICmpForCondCleanup26->setOperand(0, FirstI32ForCondCleanup3); + LastICmpForCondCleanup26->setOperand( + 1, + ConstantInt::get(LastICmpForCondCleanup26->getOperand(1)->getType(), 8)); + + BasicBlock *ForBody79LrPh = + cloneBasicBlockWithRelations(ForBody27LrPh, ".clone", &F); + ForBody79LrPh->setName("for.body79.lr.ph"); + ForBody79LrPh->moveBefore(CloneForBody); + ForBody79LrPh->getTerminator()->setSuccessor(0, ForBodyMerged); + ForCondCleanup26->getTerminator()->setSuccessor(1, ForBody79LrPh); + // Create new and instruction in ForBody79LrPh + Builder.SetInsertPoint(ForBody79LrPh->getTerminator()); + Value *AndResult = Builder.CreateAnd( + FirstI32ForCondCleanup3, + ConstantInt::get(FirstI32ForCondCleanup3->getType(), 2147483640)); + + BasicBlock *ForCond130Preheader = + cloneBasicBlockWithRelations(ForCond23Preheader, ".clone", &F); + ForCond130Preheader->setName("for.cond130.preheader"); + ForCond130Preheader->moveAfter(CloneForBody); + ForCondCleanup26->getTerminator()->setSuccessor(0, ForCond130Preheader); + for (PHINode &Phi : ForCond130Preheader->phis()) { + Phi.setIncomingBlock(0, ForCondCleanup26); + } + // Iterate through phi nodes in ForCond130Preheader and ForCond23Preheader + // simultaneously + auto it130 = ForCond130Preheader->begin(); + auto it23 = ForCond23Preheader->begin(); + + while (it130 != ForCond130Preheader->end() && + it23 != ForCond23Preheader->end()) { + if (auto *phi130 = dyn_cast(&*it130)) { + if (auto *phi23 = dyn_cast(&*it23)) { + if (phi130->getType()->isFloatTy() && phi23->getType()->isFloatTy()) { + // Write phi float from ForCond23Preheader to incomingvalue 0 position + // in ForCond130Preheader + phi130->setIncomingValue(0, phi23); + } + } + ++it23; + } + ++it130; + } + getFirstFloatPhi(ForCond130Preheader)->setIncomingValue(0, firstFloatPhi); + + getFirstI32Phi(ForCond130Preheader) + ->setIncomingValue(0, getFirstI32Phi(ForCondCleanup26)); + + PHINode *LastI32Phi130 = getLastI32Phi(ForCond130Preheader); + LastI32Phi130->setIncomingValue( + 0, ConstantInt::get(getLastI32Phi(ForCond130Preheader)->getType(), 0)); + LastI32Phi130->setIncomingValue(1, AndResult); + + ICmpInst *LastICmp130 = getLastICmpInst(ForCond130Preheader); + LastICmp130->setOperand(1, 
FirstI32ForCondCleanup3); + + PHINode *LastI32PhiClone = getLastFloatPhi(CloneForBody); + LastI32PhiClone->setIncomingValue(1, tailcallInst); + + // modify for.cond23.preheader.loopexit + // modify for.cond63.preheader + for (PHINode &Phi : ForCond23Preheader->phis()) { + Phi.setIncomingBlock(1, ForBodyMerged); + } + ForBodyMerged->getTerminator()->setSuccessor(0, ForCond130Preheader); + + CloneForBody->getTerminator()->setSuccessor(0, ForCondCleanup26LoopExit); + + // Get for.cond.cleanup.loopexit basic block + BasicBlock *ForCondCleanupLoopExit = + getBasicBlockByName(F, "for.cond23.preheader.loopexit"); + + // Check if for.cond.cleanup.loopexit exists + if (ForCondCleanupLoopExit) { + // Check if for.cond.cleanup.loopexit has no predecessors + if (pred_empty(ForCondCleanupLoopExit)) { + // Delete for.cond.cleanup.loopexit basic block + ForCondCleanupLoopExit->eraseFromParent(); + } + } + + ForBodyMerged->getTerminator()->setSuccessor(0, ForCond23Preheader); +} + +static bool copyFloatPhiIncomingValue(int i, BasicBlock *srcBB, + BasicBlock *tarBB) { + assert(srcBB && tarBB && "srcBB or tarBB should not be nullptr"); + // Collect phi float nodes from ForCond130Preheader in reverse order into + // vector + SmallVector floatPhis; + + for (auto it = srcBB->rbegin(); it != srcBB->rend(); ++it) { + if (PHINode *phi = dyn_cast(&*it)) { + if (phi->getType()->isFloatTy()) { + floatPhis.push_back(phi->getIncomingValue(i)); + } + } + } + + // Traverse phi float nodes in ForBodyMerged in reverse order and store values + // from floatPhis into their incoming value 0 + auto floatPhiIt = floatPhis.begin(); + for (auto it = tarBB->rbegin(); + it != tarBB->rend() && floatPhiIt != floatPhis.end(); ++it) { + if (PHINode *phi = dyn_cast(&*it)) { + if (phi->getType()->isFloatTy()) { + phi->setIncomingValue(i, *floatPhiIt); + ++floatPhiIt; + } + } + } + return true; +} + +static void modifyFirdSecondLoop(Function &F, Loop *L, + BasicBlock *ForBodyMerged, + BasicBlock *CloneForBody) { + BasicBlock *ForBody = L->getHeader(); + + BasicBlock *ForBody133LrPh = + BasicBlock::Create(CloneForBody->getContext(), "for.body133.lr.ph", + CloneForBody->getParent(), CloneForBody); + + updatePredecessorsToPreheader(CloneForBody, ForBody133LrPh); + + BasicBlock *PredBB = ForBodyMerged->getSinglePredecessor(); + if (!PredBB) { + // If there is no single predecessor, iterate through all predecessors + for (BasicBlock *Pred : predecessors(ForBodyMerged)) { + PredBB = Pred; + break; // Only take the first predecessor + } + } + SmallVector FMulAddCalls; + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + for (Instruction &I : *ForBodyMerged) { + if (CallInst *CI = dyn_cast(&I)) { + if (Function *F = CI->getCalledFunction()) { + if (F->getName() == "llvm.fmuladd.f32" && CI->isTailCall()) { + FMulAddCalls.push_back(CI); + } + } + } + } + + // Insert phi nodes for each FMulAdd call + for (CallInst *CI : FMulAddCalls) { + // Create new phi node + PHINode *PHI = PHINode::Create(CI->getType(), 2, CI->getName() + "acc", CI); + + // Set incoming values for phi node + PHI->addIncoming(ConstantFP::get(CI->getType(), 0), PredBB); + PHI->addIncoming(CI, ForBodyMerged); + + CI->setOperand(2, PHI); + } + PHINode *n22_075 = getFirstI32Phi(ForBodyMerged); + // Create new phi node in ForBodyMerged + PHINode *Add76310 = PHINode::Create(Type::getInt32Ty(F.getContext()), 2, + "add76310", &ForBodyMerged->front()); + Add76310->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 8), + ForBody133LrPh); + 
n22_075->setIncomingValue(1, Add76310); + // Create new add instruction in ForBodyMerged + IRBuilder<> Builder(ForBodyMerged->getTerminator()); + Value *Add76 = Builder.CreateAdd( + Add76310, ConstantInt::get(Type::getInt32Ty(F.getContext()), 8), "add76", + true, true); + + // Update phi node's loop edge + Add76310->addIncoming(Add76, ForBodyMerged); + + movePHINodesToTop(*ForBodyMerged); + modifyAddToOr(ForBodyMerged); + + ICmpInst *LastICmp = getLastICmpInst(ForBodyMerged); + LastICmp->setPredicate(ICmpInst::ICMP_SGT); + cast(Add76)->moveBefore(LastICmp); + LastICmp->setOperand(0, Add76); + for (PHINode &Phi : ForBodyMerged->phis()) { + Phi.setIncomingBlock(0, PredBB); + } + + BasicBlock *NewForEnd141 = + BasicBlock::Create(F.getContext(), "for.end141", &F, CloneForBody); + NewForEnd141->moveAfter(CloneForBody); + + BasicBlock *ForCond1Preheader = getBasicBlockByName(F, "for.cond1.preheader"); + for (PHINode &Phi : ForCond1Preheader->phis()) { + Phi.setIncomingBlock(1, NewForEnd141); + } + PHINode *ForCond1PreheaderLastI32Phi = getLastI32Phi(ForCond1Preheader); + // Insert new add instruction in NewForEnd141 + Builder.SetInsertPoint(NewForEnd141); + Value *Inc152 = + Builder.CreateAdd(ForCond1PreheaderLastI32Phi, + ConstantInt::get(Type::getInt32Ty(F.getContext()), 1), + "inc152", true, true); + Inc152->setName("inc152"); + + // Update PHI nodes in ForCond1Preheader + ForCond1PreheaderLastI32Phi->setIncomingValue(1, Inc152); + + BasicBlock *ForCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + getFirstI32Phi(ForCondCleanup)->setIncomingBlock(1, NewForEnd141); + + // Find len parameter in function F + Value *LenArg = getLenFromEntryBlock(F); + assert(LenArg && "LenArg should be"); + + // Create comparison instruction + Value *ExitCond350 = Builder.CreateICmpEQ(Inc152, LenArg, "exitcond350.not"); + + // Create conditional branch instruction + Builder.CreateCondBr(ExitCond350, ForCondCleanup, ForCond1Preheader); + + BasicBlock *ForCond130Preheader = + getBasicBlockByName(F, "for.cond130.preheader"); + for (PHINode &phi : ForCond130Preheader->phis()) { + phi.setIncomingBlock(1, ForBodyMerged); + } + ForCond130Preheader->getTerminator()->setSuccessor(0, ForBody133LrPh); + ForCond130Preheader->getTerminator()->setSuccessor(1, NewForEnd141); + + // ForBody133LrPh + // Create new instructions in ForBody133LrPh + BasicBlock *ForBody79LrPh = getBasicBlockByName(F, "for.body79.lr.ph"); + ForBody79LrPh->getTerminator()->setSuccessor(0, ForBodyMerged); + // Copy loadinst from ForBody79LrPh to ForBody133LrPh + Builder.SetInsertPoint(ForBody133LrPh->getTerminator()); + for (Instruction &I : *ForBody79LrPh) { + if (isa(I)) { + Instruction *ClonedInst = I.clone(); + ClonedInst->setName(I.getName()); + Builder.Insert(ClonedInst); + } + } + + // modify ForBodyMerged + for (PHINode &Phi : ForBodyMerged->phis()) { + Phi.setIncomingBlock(0, ForBody79LrPh); + } + + PHINode *coeff_pos174 = getLastI32Phi(ForBodyMerged); + PHINode *coeff_pos_0_lcssa_clone = getFirstI32Phi(ForCond130Preheader); + coeff_pos_0_lcssa_clone->setIncomingValue(1, + coeff_pos174->getIncomingValue(1)); + coeff_pos174->setIncomingValue(0, + coeff_pos_0_lcssa_clone->getIncomingValue(0)); + + bool res = copyFloatPhiIncomingValue(0, ForCond130Preheader, ForBodyMerged); + assert(res && "copyFloatPhiIncomingZeroValue failed"); + + bool res1 = copyFloatPhiIncomingValue(1, ForBodyMerged, ForCond130Preheader); + assert(res1 && "copyFloatPhiIncomingValue failed"); + // Find first and last load instructions in ForBody79LrPh + LoadInst 
*FirstLoad = nullptr; + LoadInst *LastLoad = nullptr; + + for (Instruction &I : *ForBody79LrPh) { + if (auto *LI = dyn_cast(&I)) { + if (!FirstLoad) { + FirstLoad = LI; + } + LastLoad = LI; + } + } + + assert(FirstLoad && LastLoad && + "Could not find load instructions in ForBody79LrPh"); + // Iterate through GetElementPtrInst + std::vector GEPInsts; + for (Instruction &I : *ForBodyMerged) { + if (auto *GEP = dyn_cast(&I)) { + GEPInsts.push_back(GEP); + } + } + + // Ensure there is at least one getelementptr instruction + if (!GEPInsts.empty()) { + for (size_t i = 0; i < GEPInsts.size(); ++i) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + if (i % 2 == 1) { // odd + CurrentGEP->setOperand(0, LastLoad); + } else { // even + CurrentGEP->setOperand(0, FirstLoad); + } + } + } + + // Ensure there is at least one getelementptr instruction + if (!GEPInsts.empty()) { + // Get first getelementptr instruction + GetElementPtrInst *FirstGEP = GEPInsts[0]; + + // Starting from index 1, process every other getelementptr + for (size_t i = 2; i < GEPInsts.size(); i += 2) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + // Set current getelementptr's operand 0 to first getelementptr's value + CurrentGEP->setOperand(0, FirstGEP); + + // Set operand 1 to current index value + CurrentGEP->setOperand( + 1, ConstantInt::get(CurrentGEP->getOperand(1)->getType(), (i) / 2)); + } + } + + ForBodyMerged->getTerminator()->setSuccessor(0, ForCond130Preheader); + + // modify for.body27.clone + PHINode *n_0_lcssa_clone = getLastI32Phi(ForCond130Preheader); + PHINode *acc_0_lcssa_clone = getFirstFloatPhi(ForCond130Preheader); + Instruction *tailcallInst = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + Instruction *firstAddInst = nullptr; + Instruction *lastAddInst = nullptr; + for (Instruction &I : *CloneForBody) { + if (I.getOpcode() == Instruction::Add) { + if (!firstAddInst) { + firstAddInst = &I; + } + lastAddInst = &I; + } + } + int index = 0; + for (PHINode &Phi : CloneForBody->phis()) { + Phi.setIncomingBlock(0, ForBody133LrPh); + Phi.setIncomingBlock(1, CloneForBody); + if (index == 0) { + Phi.setIncomingValue(0, n_0_lcssa_clone); + Phi.setIncomingValue(1, lastAddInst); + } else if (index == 1) { + Phi.setIncomingValue(0, coeff_pos_0_lcssa_clone); + Phi.setIncomingValue(1, firstAddInst); + } else if (index == 2) { + Phi.setIncomingValue(0, acc_0_lcssa_clone); + Phi.setIncomingValue(1, tailcallInst); + } + index++; + } + + CloneForBody->getTerminator()->setSuccessor(0, NewForEnd141); + CloneForBody->getTerminator()->setSuccessor(1, CloneForBody); + + // modify for.end141 + // Create phi float node in NewForEnd141 + PHINode *AccPhi = PHINode::Create(Type::getFloatTy(F.getContext()), 2, + "acc0.3.lcssa", &NewForEnd141->front()); + AccPhi->addIncoming(acc_0_lcssa_clone, ForCond130Preheader); + AccPhi->addIncoming(tailcallInst, CloneForBody); + + int i = 0; + Value *Sum = nullptr; + Instruction *insertPoint = AccPhi->getNextNode(); + // Count the number of float type phi nodes in ForCond130Preheader + SmallVector floatPhis; + for (PHINode &phi : ForCond130Preheader->phis()) { + if (phi.getType()->isFloatTy()) { + floatPhis.push_back(&phi); + } + } + assert(floatPhis.size() == 8 && + "Expected 8 float phi nodes in ForCond130Preheader"); + // Create parallel add instructions for better performance + Value *Add60 = + BinaryOperator::CreateFAdd(floatPhis[1], AccPhi, "add60", insertPoint); + Value *Add61 = BinaryOperator::CreateFAdd(floatPhis[2], floatPhis[3], "add61", + insertPoint); + Value 
*Add62 = BinaryOperator::CreateFAdd(floatPhis[4], floatPhis[5], "add62", + insertPoint); + Value *Add63 = BinaryOperator::CreateFAdd(floatPhis[6], floatPhis[7], "add63", + insertPoint); + Value *Add64 = BinaryOperator::CreateFAdd(Add60, Add61, "add64", insertPoint); + Value *Add65 = BinaryOperator::CreateFAdd(Add62, Add63, "add65", insertPoint); + Value *Add66 = BinaryOperator::CreateFAdd(Add64, Add65, "add66", insertPoint); + Sum = Add66; + + // Move getelementptr and store instructions from for.cond.cleanup26 to + // NewForEnd141 + BasicBlock *ForCondCleanup26 = getBasicBlockByName(F, "for.cond.cleanup26"); + + SmallVector instructionsToMove; + + // Collect instructions to move + for (Instruction &I : *ForCondCleanup26) { + if (isa(I) || isa(I)) { + instructionsToMove.push_back(&I); + } + } + + // Move instructions + for (Instruction *I : instructionsToMove) { + I->moveBefore(insertPoint); + if (isa(I)) { + I->setOperand(0, Sum); + } + } + + // Update instructions that used moved instructions + for (Instruction &I : *NewForEnd141) { + I.replaceUsesOfWith(ForCondCleanup26, NewForEnd141); + } + + // Get for.cond.cleanup.loopexit basic block + BasicBlock *ForCondCleanupLoopExit = + getBasicBlockByName(F, "for.cond.cleanup.loopexit"); + + // Check if for.cond.cleanup.loopexit exists + if (ForCondCleanupLoopExit) { + // Check if for.cond.cleanup.loopexit has no predecessors + if (pred_empty(ForCondCleanupLoopExit)) { + // Delete for.cond.cleanup.loopexit basic block + ForCondCleanupLoopExit->eraseFromParent(); + } + } + + setBBFromOtherBB(F, "for.body133.lr.ph", CloneForBody); +} + +// Main function to perform FIRD unrolling +static void PostUnrollFird(Function &F, Loop *L, int loop_index) { + BasicBlock *ForBody = L->getHeader(); + BasicBlock *CloneForBody = + cloneBasicBlockWithRelations(ForBody, ".clone", &F); + CloneForBody->moveAfter(ForBody); + CloneForBody->getTerminator()->setSuccessor(1, ForBody); + + // Merge basic blocks + std::vector BBsToMerge; + for (int i = 1; i < 8; ++i) { + std::string BBName = (ForBody->getName() + "." 
+ std::to_string(i)).str(); + BasicBlock *ForBodyClone = getBasicBlockByName(F, BBName); + if (ForBodyClone) { + BBsToMerge.push_back(ForBodyClone); + } else { + llvm_unreachable("can't find ForBodyClone"); + } + } + if (BBsToMerge.size() == 7) { + for (BasicBlock *BB : BBsToMerge) { + MergeBasicBlockIntoOnlyPred(BB); + } + } + BasicBlock *ForBodyMerged = BBsToMerge[6]; + CloneForBody->moveAfter(ForBodyMerged); + + // Perform loop-specific modifications + if (loop_index == 1) { + modifyFirdFirstLoop(F, L, ForBodyMerged, CloneForBody); + } else if (loop_index == 2) { + modifyFirdSecondLoop(F, L, ForBodyMerged, CloneForBody); + } +} + +// Helper function to check if a loop is simple (single-level, innermost, and +// outermost) +static bool isSimpleLoop(const Loop *L) { + return L->getLoopDepth() == 1 && L->isInnermost() && L->isOutermost(); +} + +// Handle simple loops +static bool handleSimpleLoop(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE) { + if (shouldUnrollLoopWithCount(F, L, SE)) { + LLVM_DEBUG(errs() << "Unrolling loop with count\n"); + auto UnrollResult = + UnrollLoop(L, + {/*Count*/ 8, /*Force*/ true, /*Runtime*/ false, + /*AllowExpensiveTripCount*/ true, + /*UnrollRemainder*/ true, true}, + LI, &SE, &DT, &AC, &TTI, /*ORE*/ &ORE, true); + postUnrollLoopWithCount(F, L, 8); + return true; + } + + if (shouldUnrollComplexLoop(F, L, SE, DT, *LI)) { + LLVM_DEBUG(errs() << "Unrolling complex loop\n"); + auto UnrollResult = + UnrollLoop(L, + {/*Count*/ 8, /*Force*/ true, /*Runtime*/ false, + /*AllowExpensiveTripCount*/ true, + /*UnrollRemainder*/ true, true}, + LI, &SE, &DT, &AC, &TTI, /*ORE*/ &ORE, true); + postUnrollLoopWithVariable(F, L, 8); + return true; + } + + if (shouldUnrollAddcType(F, LI)) { + LLVM_DEBUG(errs() << "Unrolling ADDC type loop\n"); + unrollAddc(F, SE, L, 16); + currentUnrollType = UnrollType::ADD_ADDC_SUB_MUL_MULC_SQRT; + return true; + } + + if (shouldUnrollDotprodType(F, LI)) { + LLVM_DEBUG(errs() << "Transforming dot product type loop\n"); + currentUnrollType = UnrollType::DOTPROD; + transformOneLoopDepth(F); + return true; + } + + LLVM_DEBUG(errs() << "No unrolling performed for this loop\n"); + return false; +} + +// Helper function to simplify loop and form LCSSA +static void simplifyAndFormLCSSA(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution &SE, AssumptionCache &AC) { + simplifyLoop(L, &DT, LI, &SE, &AC, nullptr, false); + formLCSSARecursively(*L, DT, LI, &SE); +} + +// Helper function to get CONV unroll factor +static unsigned int getConvUnrollFactor(uint32_t unrollCount) { + static const unsigned int unrollFactors[] = {8, 16, 8}; + return unrollFactors[unrollCount % 3]; +} + +// Handle CONV type unrolling +static bool handleConvUnroll(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + uint32_t &unrollCount) { + LLVM_DEBUG(errs() << "Unrolling CONV type loop\n"); + currentUnrollType = UnrollType::CONV_CCORR; + + unsigned int unrollFactor = getConvUnrollFactor(unrollCount); + simplifyAndFormLCSSA(L, DT, LI, SE, AC); + + auto UnrollResult = + UnrollLoop(L, {unrollFactor, true, false, true, true, true}, LI, &SE, &DT, + &AC, &TTI, &ORE, true); + + unrollCount++; + return true; +} + +// Handle FIRD type unrolling +static bool handleFirdUnroll(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, 
DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + uint32_t &unroll_times) { + LLVM_DEBUG(errs() << "Unrolling FIRD type loop\n"); + currentUnrollType = UnrollType::FIRD; + + if (unroll_times == 0) { + unroll_times++; + return false; + } + + simplifyAndFormLCSSA(L, DT, LI, SE, AC); + + auto UnrollResult = UnrollLoop(L, {8, true, false, true, true, true}, LI, &SE, + &DT, &AC, &TTI, &ORE, false); + + return true; +} + +// Handle innermost loops +static bool handleInnermostLoop(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + uint32_t &unrollCount) { + if (shouldUnrollCorr(F, LI)) { + LLVM_DEBUG(errs() << "Unrolling correlation type loop\n"); + unrollCorr(F, L, 16); + currentUnrollType = UnrollType::CORR; + return true; + } + + if (shouldUnrollFirType(F, LI) || currentUnrollType == UnrollType::FIR) { + LLVM_DEBUG(errs() << "Transforming FIR type loop\n"); + unrollFir(F, L); + currentUnrollType = UnrollType::FIR; + return true; + } + + if (shouldUnrollConvccorr(F, LI) || + currentUnrollType == UnrollType::CONV_CCORR) { + return handleConvUnroll(F, L, SE, LI, DT, AC, TTI, ORE, unrollCount); + } + + if (shouldUnrollFird(F, LI) || currentUnrollType == UnrollType::FIRD) { + return handleFirdUnroll(F, L, SE, LI, DT, AC, TTI, ORE, unrollCount); + } + + LLVM_DEBUG(errs() << "No unrolling performed for this innermost loop\n"); + return false; +} + +// Check if unrolling should be disabled +static bool shouldDisableUnroll(const Loop *L) { + TransformationMode TM = hasUnrollTransformation(L); + return (TM & TM_Disable) != 0; +} + +static LoopUnrollResult +tryToUnrollLoop(Function &F, Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) { + // Initialize variables + bool changed = false; + static uint32_t unrollCount = 0; + // Handle single-level loops + if (isSimpleLoop(L)) { + changed = handleSimpleLoop(F, L, SE, LI, DT, AC, TTI, ORE); + } + // Handle innermost loops + else if (L->isInnermost()) { + changed = handleInnermostLoop(F, L, SE, LI, DT, AC, TTI, ORE, unrollCount); + } + + // Check if unrolling should be disabled + if (shouldDisableUnroll(L)) { + return LoopUnrollResult::Unmodified; + } + + return changed ? 
LoopUnrollResult::PartiallyUnrolled + : LoopUnrollResult::Unmodified; +} + +// Helper function to process CONV unroll type +void processConvUnroll(Function &F, const SmallVector &InnerLoops) { + static const int unroll_counts[] = {8, 16, 8}; + static int unroll_index = 0; + for (auto *L : InnerLoops) { + PostUnrollConv(F, L, unroll_counts[unroll_index], unroll_index); + unroll_index = (unroll_index + 1) % 3; + } +} + +// Helper function to process FIRD unroll type +void processFirdUnroll(Function &F, const SmallVector &InnerLoops) { + static int loop_index = 0; + for (auto &L : InnerLoops) { + if (loop_index == 0) { + loop_index++; + continue; + } + PostUnrollFird(F, L, loop_index); + loop_index++; + } +} + +static void addCommonOptimizationPasses(Function &F) { + // Create necessary analysis managers + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + + // Create pass builder + PassBuilder PB; + + // Register analyses + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + // Create function-level optimization pipeline + FunctionPassManager FPM; + + if (currentUnrollType == UnrollType::CORR || + currentUnrollType == UnrollType::FIRD) + FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass())); + FPM.addPass(EarlyCSEPass(true)); + FPM.addPass(ReassociatePass()); + + FPM.run(F, FAM); +} + +static void addLegacyCommonOptimizationPasses(Function &F) { + legacy::FunctionPassManager FPM(F.getParent()); + FPM.add(createLoopSimplifyPass()); + FPM.add(createLICMPass()); // Loop Invariant Code Motion + + // Add SimplifyCFG pass with common options + FPM.add(createCFGSimplificationPass( + SimplifyCFGOptions() + .bonusInstThreshold(1) // Set instruction bonus threshold + .forwardSwitchCondToPhi( + true) // Allow forwarding switch conditions to phi + .convertSwitchToLookupTable( + true) // Allow converting switch to lookup table + .needCanonicalLoops(false) // Don't require canonical loop form + .hoistCommonInsts(true) // Hoist common instructions + .sinkCommonInsts(true) // Sink common instructions + )); + + // Initialize and run passes + FPM.doInitialization(); + FPM.run(F); + FPM.doFinalization(); +} + +PreservedAnalyses +RISCVLoopUnrollAndRemainderPass::run(Function &F, FunctionAnalysisManager &AM) { + if (!EnableRISCVLoopUnrollAndRemainder || F.arg_empty()) + return PreservedAnalyses::all(); + + addnoalias(F); + auto &LI = AM.getResult(F); + if (LI.empty()) + return PreservedAnalyses::all(); + + // Retrieve necessary analysis results + auto &SE = AM.getResult(F); + auto &TTI = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &AC = AM.getResult(F); + auto &ORE = AM.getResult(F); + + LoopAnalysisManager *LAM = nullptr; + if (auto *LAMProxy = AM.getCachedResult(F)) + LAM = &LAMProxy->getManager(); + + auto &MAMProxy = AM.getResult(F); + ProfileSummaryInfo *PSI = + MAMProxy.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) + ? 
&AM.getResult(F) + : nullptr; + + bool Changed = false; + + // Process loops in reverse order of LoopInfo + SmallPriorityWorklist Worklist; + appendLoopsToWorklist(LI, Worklist); + SmallVector InnerLoops; + + while (!Worklist.empty()) { + Loop &L = *Worklist.pop_back_val(); + if (L.getBlocks().empty()) { + LLVM_DEBUG(errs() << "Skipping empty loop\n"); + continue; + } + + std::string LoopName = std::string(L.getName()); + if (L.getName().contains(".clone")) + continue; + + if (L.isInnermost()) { + InnerLoops.push_back(&L); + } + + LoopUnrollResult Result = + tryToUnrollLoop(F, &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI); + Changed |= Result != LoopUnrollResult::Unmodified; + + // Clear cached analysis results if loop was fully unrolled + if (LAM && Result == LoopUnrollResult::FullyUnrolled) + LAM->clear(L, LoopName); + } + + // Post-processing for specific unroll types + if (currentUnrollType == UnrollType::CONV_CCORR) { + processConvUnroll(F, InnerLoops); + } else if (currentUnrollType == UnrollType::FIRD) { + processFirdUnroll(F, InnerLoops); + } + + // Run dead code elimination + runDeadCodeElimination(F); + if (currentUnrollType != UnrollType::FIR) + addCommonOptimizationPasses(F); + if (currentUnrollType == UnrollType::FIRD) { + addLegacyCommonOptimizationPasses(F); + } + // Verify function + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(errs() << "Function verification failed\n"); + report_fatal_error("Function verification failed"); + } + + return Changed ? getLoopPassPreservedAnalyses() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h new file mode 100644 index 00000000000000..9e941cae210ad1 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h @@ -0,0 +1,42 @@ +//===- RISCVLoopUnrollAndRemainder.h - Loop Unrolling and Remainder Handling +//------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RISCVLoopUnrollAndRemainder pass +// +// This pass performs loop unrolling and handles the remainder iterations. +// It aims to improve performance by: +// 1. Unrolling loops to reduce loop overhead and enable further optimizations +// 2. 
Generating efficient code for handling any remaining iterations +// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_RISCVLOOPUNROLLANDREMAINDER_H +#define LLVM_TRANSFORMS_UTILS_RISCVLOOPUNROLLANDREMAINDER_H + +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class RecurrenceDescriptor; +extern cl::opt EnableRISCVLoopUnrollAndRemainder; +class Function; + +struct RISCVLoopUnrollAndRemainderPass + : public PassInfoMixin { + RISCVLoopUnrollAndRemainderPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_RISCVLOOPUNROLLANDREMAINDER_H diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 6b30ce7f904bb5..0c70f5e67a5266 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h" #include "RISCVCustomLICM.h" +#include "RISCVLoopUnrollAndRemainder.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSplitLoopByLength.h" #include "RISCVTargetObjectFile.h" @@ -459,6 +460,10 @@ void RISCVTargetMachine::registerPassBuilderCallbacks( FPM.addPass(RISCVCustomLICMPass()); return true; } + if (Name == "riscv-loop-unroll-and-remainder") { + FPM.addPass(RISCVLoopUnrollAndRemainderPass()); + return true; + } return false; }); @@ -467,9 +472,11 @@ void RISCVTargetMachine::registerPassBuilderCallbacks( if(EnableEsp32P4Optimize && (Level == OptimizationLevel::O3 || Level == OptimizationLevel::O2)){ EnableRISCVSplitLoopByLength = true; EnableRISCVCustomLICM = true; + EnableRISCVLoopUnrollAndRemainder = true; FunctionPassManager FPM; FPM.addPass(RISCVSplitLoopByLengthPass()); FPM.addPass(RISCVCustomLICMPass()); + FPM.addPass(RISCVLoopUnrollAndRemainderPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } }); diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll index 3960501c6ff11d..a608ae2933aecf 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_add_f32_ansi(ptr noundef readonly %input1, ptr noundef readonly %input2, ptr noundef writeonly %output, i32 noundef %len, i32 noundef %step1, i32 noundef %step2, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_add_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT1:%.*]], ptr noundef readonly [[INPUT2:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT1:%.*]], ptr noalias noundef readonly [[INPUT2:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 
noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT1]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[INPUT2]], null @@ -12,19 +12,159 @@ define dso_local noundef i32 @dsps_add_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: br i1 [[OR_COND19]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP720:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP720]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_021_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_021_MODIFY]], 16 +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[I_021_MODIFY]], 1 +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_021_MODIFY]], 2 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_021_MODIFY]], 3 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_021_MODIFY]], 4 +; CHECK-NEXT: [[ADD23:%.*]] = or disjoint i32 [[I_021_MODIFY]], 5 +; CHECK-NEXT: [[ADD28:%.*]] = or disjoint i32 [[I_021_MODIFY]], 6 +; CHECK-NEXT: [[ADD33:%.*]] = or disjoint i32 [[I_021_MODIFY]], 7 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_021_MODIFY]], 8 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[I_021_MODIFY]], 9 +; CHECK-NEXT: [[ADD48:%.*]] = or disjoint i32 [[I_021_MODIFY]], 10 +; CHECK-NEXT: [[ADD53:%.*]] = or disjoint i32 [[I_021_MODIFY]], 11 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_021_MODIFY]], 12 +; CHECK-NEXT: [[ADD63:%.*]] = or disjoint i32 [[I_021_MODIFY]], 13 +; CHECK-NEXT: [[ADD68:%.*]] = or disjoint i32 [[I_021_MODIFY]], 14 +; CHECK-NEXT: [[ADD73:%.*]] = or disjoint i32 [[I_021_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX9_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX11_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 
[[ADD13]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD48]] +; CHECK-NEXT: [[ARRAYIDX50:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD48]] +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD48]] +; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD53]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD53]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD53]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX62:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD63]] +; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD63]] +; CHECK-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD63]] +; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD68]] +; CHECK-NEXT: [[ARRAYIDX70:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD68]] +; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD68]] +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD73]] +; CHECK-NEXT: [[ARRAYIDX75:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 
[[ADD73]] +; CHECK-NEXT: [[ARRAYIDX77:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD73]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_MODIFY]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX50]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX54]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX65]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX69]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX70]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX74]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX75]], align 4 +; CHECK-NEXT: [[ADD_MODIFY:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[ADD11:%.*]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[ADD21:%.*]] = fadd float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[ADD26:%.*]] = fadd float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[ADD36:%.*]] = fadd float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[ADD41:%.*]] = fadd float [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ADD46:%.*]] = fadd float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[ADD51:%.*]] = fadd float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[ADD56:%.*]] = fadd float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[ADD61:%.*]] = fadd float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[ADD66:%.*]] = fadd float [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[ADD71:%.*]] = fadd float [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[ADD76:%.*]] = fadd float [[TMP32]], [[TMP33]] +; CHECK-NEXT: store float [[ADD_MODIFY]], ptr [[ARRAYIDX11_MODIFY]], align 4 +; CHECK-NEXT: store float [[ADD5]], ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: store float 
[[ADD11]], ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: store float [[ADD16]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: store float [[ADD26]], ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: store float [[ADD31]], ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: store float [[ADD36]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[ADD41]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: store float [[ADD46]], ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: store float [[ADD51]], ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: store float [[ADD56]], ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[ADD61]], ptr [[ARRAYIDX62]], align 4 +; CHECK-NEXT: store float [[ADD66]], ptr [[ARRAYIDX67]], align 4 +; CHECK-NEXT: store float [[ADD71]], ptr [[ARRAYIDX72]], align 4 +; CHECK-NEXT: store float [[ADD76]], ptr [[ARRAYIDX77]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_021]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[I_021]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP34]], [[TMP35]] ; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[I_021]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10]] ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX11]], align 4 @@ -35,11 +175,11 @@ define dso_local noundef i32 @dsps_add_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[I_021_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL8_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8_CLONE]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 -; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 +; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[TMP36]], [[TMP37]] ; CHECK-NEXT: [[MUL10_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10_CLONE]] ; CHECK-NEXT: store float [[ADD_CLONE]], ptr [[ARRAYIDX11_CLONE]], align 4 @@ -47,7 +187,7 @@ define dso_local noundef i32 
@dsps_add_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll index dd35ce0373fc6d..bf98ec71686bc5 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, ptr noundef writeonly %output, i32 noundef %len, float noundef %C, i32 noundef %step_in, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_addc_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], float noundef [[C:%.*]], i32 noundef [[STEP_IN:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], float noundef [[C:%.*]], i32 noundef [[STEP_IN:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[OUTPUT]], null @@ -10,16 +10,124 @@ define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: br i1 [[OR_COND]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP412:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP412]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_013_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_013_MODIFY]], 16 +; CHECK-NEXT: 
[[ADD2:%.*]] = or disjoint i32 [[I_013_MODIFY]], 1 +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_013_MODIFY]], 2 +; CHECK-NEXT: [[ADD10:%.*]] = or disjoint i32 [[I_013_MODIFY]], 3 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[I_013_MODIFY]], 4 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_013_MODIFY]], 5 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_013_MODIFY]], 6 +; CHECK-NEXT: [[ADD26:%.*]] = or disjoint i32 [[I_013_MODIFY]], 7 +; CHECK-NEXT: [[ADD30:%.*]] = or disjoint i32 [[I_013_MODIFY]], 8 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_013_MODIFY]], 9 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_013_MODIFY]], 10 +; CHECK-NEXT: [[ADD42:%.*]] = or disjoint i32 [[I_013_MODIFY]], 11 +; CHECK-NEXT: [[ADD46:%.*]] = or disjoint i32 [[I_013_MODIFY]], 12 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[I_013_MODIFY]], 13 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[I_013_MODIFY]], 14 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_013_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_013_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX6_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_013_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds 
float, ptr [[INPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX53:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX43]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[ADD_MODIFY:%.*]] = fadd float [[C]], [[TMP2]] +; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[C]], [[TMP3]] +; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[C]], [[TMP4]] +; CHECK-NEXT: [[ADD12:%.*]] = fadd float [[C]], [[TMP5]] +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[C]], [[TMP6]] +; CHECK-NEXT: [[ADD20:%.*]] = fadd float [[C]], [[TMP7]] +; CHECK-NEXT: [[ADD24:%.*]] = fadd float [[C]], [[TMP8]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[C]], [[TMP9]] +; CHECK-NEXT: [[ADD32:%.*]] = fadd float [[C]], [[TMP10]] +; CHECK-NEXT: [[ADD36:%.*]] = fadd float [[C]], [[TMP11]] +; CHECK-NEXT: [[ADD40:%.*]] = fadd float [[C]], [[TMP12]] +; CHECK-NEXT: [[ADD44:%.*]] = fadd float [[C]], [[TMP13]] +; CHECK-NEXT: [[ADD48:%.*]] = fadd float [[C]], [[TMP14]] +; CHECK-NEXT: [[ADD52:%.*]] = fadd float [[C]], [[TMP15]] +; CHECK-NEXT: [[ADD56:%.*]] = fadd float [[C]], [[TMP16]] +; CHECK-NEXT: [[ADD60:%.*]] = fadd float [[C]], [[TMP17]] +; CHECK-NEXT: store float [[ADD_MODIFY]], ptr [[ARRAYIDX6_MODIFY]], align 4 +; CHECK-NEXT: store float [[ADD4]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: store float [[ADD8]], ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: store float [[ADD12]], ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: store float [[ADD16]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[ADD20]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[ADD24]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: store float [[ADD32]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[ADD36]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[ADD40]], ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: store float [[ADD44]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: store float [[ADD48]], ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: store float [[ADD52]], ptr [[ARRAYIDX53]], align 4 +; CHECK-NEXT: store float [[ADD56]], 
ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[ADD60]], ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_013]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[C]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[C]], [[TMP18]] ; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[I_013]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL5]] ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX6]], align 4 @@ -30,8 +138,8 @@ define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: [[I_013_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_013_CLONE]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[C]], [[TMP19]] ; CHECK-NEXT: [[MUL5_CLONE:%.*]] = mul nsw i32 [[I_013_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX6_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL5_CLONE]] ; CHECK-NEXT: store float [[ADD_CLONE]], ptr [[ARRAYIDX6_CLONE]], align 4 @@ -39,7 +147,7 @@ define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll index 11c9c556d526e6..0432a51dfbb38e 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_ccorr_f32_ansi(ptr noundef readonly %Signal, i32 noundef %siglen, ptr noundef readonly %Kernel, i32 
noundef %kernlen, ptr noundef writeonly %corrvout) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_ccorr_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noundef writeonly [[CORRVOUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noalias noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noalias noundef writeonly [[CORRVOUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[SIGNAL]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[KERNEL]], null @@ -21,36 +21,131 @@ define dso_local noundef i32 @dsps_ccorr_f32_ansi(ptr noundef readonly %Signal, ; CHECK-NEXT: [[KERN_0:%.*]] = phi ptr [ [[SIGNAL]], [[IF_THEN8]] ], [ [[KERNEL]], [[IF_END6]] ] ; CHECK-NEXT: [[SIG_0:%.*]] = phi ptr [ [[KERNEL]], [[IF_THEN8]] ], [ [[SIGNAL]], [[IF_END6]] ] ; CHECK-NEXT: [[CMP10124:%.*]] = icmp sgt i32 [[LKERN_0]], 0 -; CHECK-NEXT: br i1 [[CMP10124]], label [[FOR_BODY:%.*]], label [[FOR_COND22_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP10124]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND22_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond22.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND22_PREHEADER]] ; CHECK: for.cond22.preheader: ; CHECK-NEXT: [[CMP23128:%.*]] = icmp slt i32 [[LKERN_0]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[CMP23128]], label [[FOR_BODY25:%.*]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP23128]], label [[FOR_BODY25_PREHEADER:%.*]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK: for.body25.preheader: +; CHECK-NEXT: [[DIV536:%.*]] = and i32 [[LKERN_0]], -16 +; CHECK-NEXT: br label [[FOR_BODY25:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[IF_END9]] ] -; CHECK-NEXT: [[N_0125:%.*]] = phi i32 [ [[INC19:%.*]], [[FOR_END]] ], [ 0, [[IF_END9]] ] -; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[N_0125]], -1 -; CHECK-NEXT: [[SUB11:%.*]] = add nsw i32 [[LKERN_0]], [[TMP0]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_0125:%.*]] = phi i32 [ [[INC19:%.*]], [[FOR_END]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[N_0125]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[N_0125]], 2147483640 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[N_0125]], -1 +; CHECK-NEXT: [[SUB11:%.*]] = add nsw i32 [[TMP3]], [[LKERN_0]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[CORRVOUT]], i32 [[N_0125]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[FOR_BODY14:%.*]] -; CHECK: for.body14: -; CHECK-NEXT: [[K_0123:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP4:%.*]], [[FOR_BODY14]] ] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY14_CLONE_PREHEADER:%.*]], label [[FOR_BODY14_7:%.*]] +; CHECK: for.body14.7: +; CHECK-NEXT: [[K_0123:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC_7:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP20:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI1:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP21:%.*]], 
[[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI2:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP22:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI3:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP23:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI5:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP26:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP27:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[K_0123]], [[SUB11]] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[K_0123]], 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[INC]], [[SUB11]] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[K_0123]], 2 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[INC_1]], [[SUB11]] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[K_0123]], 3 +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[INC_2]], [[SUB11]] +; CHECK-NEXT: [[INC_3:%.*]] = add nuw nsw i32 [[K_0123]], 4 +; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[INC_3]], [[SUB11]] +; CHECK-NEXT: [[INC_4:%.*]] = add nuw nsw i32 [[K_0123]], 5 +; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[INC_4]], [[SUB11]] +; CHECK-NEXT: [[INC_5:%.*]] = add nuw nsw i32 [[K_0123]], 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[INC_5]], [[SUB11]] +; CHECK-NEXT: [[INC_6:%.*]] = add nuw nsw i32 [[K_0123]], 7 +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[INC_6]], [[SUB11]] +; CHECK-NEXT: [[INC_7]] = add nuw nsw i32 [[K_0123]], 8 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0123]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUB11]], [[K_0123]] ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP4]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[TMP1]]) -; CHECK-NEXT: store float [[TMP4]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K_0123]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY14]] +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC]] +; CHECK-NEXT: [[ARRAYIDX16_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_1]] +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_1]] +; CHECK-NEXT: [[ARRAYIDX16_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_2]] +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_2]] +; CHECK-NEXT: [[ARRAYIDX16_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_3]] +; CHECK-NEXT: [[ARRAYIDX15_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_3]] +; CHECK-NEXT: [[ARRAYIDX16_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_4]] +; CHECK-NEXT: [[ARRAYIDX15_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_4]] +; CHECK-NEXT: [[ARRAYIDX16_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_5]] +; CHECK-NEXT: [[ARRAYIDX15_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_5]] +; CHECK-NEXT: [[ARRAYIDX16_6:%.*]] = getelementptr inbounds float, ptr 
[[KERN_0]], i32 [[ADD_6]] +; CHECK-NEXT: [[ARRAYIDX15_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_6]] +; CHECK-NEXT: [[ARRAYIDX16_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_7]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX15_1]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX16_1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX15_2]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16_2]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15_3]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX16_3]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX15_4]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX16_4]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX15_5]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX16_5]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX15_6]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX16_6]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX15_7]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX16_7]], align 4 +; CHECK-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[DOTPHI]]) +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[DOTPHI1]]) +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[DOTPHI2]]) +; CHECK-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[DOTPHI3]]) +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[DOTPHI4]]) +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[DOTPHI5]]) +; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP16]], float [[TMP17]], float [[DOTPHI6]]) +; CHECK-NEXT: [[TMP27]] = tail call float @llvm.fmuladd.f32(float [[TMP18]], float [[TMP19]], float [[DOTPHI7]]) +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp ult i32 [[INC_7]], [[TMP1]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[FOR_BODY14_7]], label [[FOR_END8:%.*]] +; CHECK: for.end8: +; CHECK-NEXT: [[SUM:%.*]] = fadd float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[SUM24:%.*]] = fadd float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[SUM25:%.*]] = fadd float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[SUM26:%.*]] = fadd float [[SUM]], [[SUM23]] +; CHECK-NEXT: [[SUM27:%.*]] = fadd float [[SUM24]], [[SUM25]] +; CHECK-NEXT: [[SUM28:%.*]] = fadd float [[SUM26]], [[SUM27]] +; CHECK-NEXT: store float [[SUM28]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br i1 false, label [[FOR_END]], label [[FOR_BODY14_CLONE_PREHEADER]] +; CHECK: for.body14.clone.preheader: +; CHECK-NEXT: [[SUM_PHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[SUM28]], [[FOR_END8]] ] +; CHECK-NEXT: [[ADD_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP0]], [[FOR_END8]] ] +; CHECK-NEXT: br label [[FOR_BODY14_CLONE:%.*]] +; CHECK: for.body14.clone: +; CHECK-NEXT: [[K_0123_CLONE:%.*]] = phi i32 [ [[ADD_PHI]], [[FOR_BODY14_CLONE_PREHEADER]] ], [ [[INC_CLONE:%.*]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi float [ [[SUM_PHI]], 
[[FOR_BODY14_CLONE_PREHEADER]] ], [ [[TMP31:%.*]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0123_CLONE]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 +; CHECK-NEXT: [[ADD_CLONE:%.*]] = add i32 [[K_0123_CLONE]], [[SUB11]] +; CHECK-NEXT: [[ARRAYIDX16_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_CLONE]] +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX16_CLONE]], align 4 +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[TMP28]]) +; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[K_0123_CLONE]], 1 +; CHECK-NEXT: [[EXITCOND_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[INDVARS_IV]] +; CHECK-NEXT: br i1 [[EXITCOND_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY14_CLONE]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: store float [[TMP31]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[INC19]] = add nuw nsw i32 [[N_0125]], 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND134_NOT:%.*]] = icmp eq i32 [[INC19]], [[LKERN_0]] -; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[FOR_COND22_PREHEADER]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[FOR_COND22_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.cond45.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND45_PREHEADER]] ; CHECK: for.cond45.preheader: ; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[SIGLEN]], -1 ; CHECK-NEXT: [[SUB47:%.*]] = add i32 [[ADD46]], [[KERNLEN]] @@ -60,57 +155,308 @@ define dso_local noundef i32 @dsps_ccorr_f32_ansi(ptr noundef readonly %Signal, ; CHECK-NEXT: [[SUB57:%.*]] = add nsw i32 [[LSIG_0]], -1 ; CHECK-NEXT: br label [[FOR_BODY50:%.*]] ; CHECK: for.body25: -; CHECK-NEXT: [[N21_0129:%.*]] = phi i32 [ [[INC42:%.*]], [[FOR_END40:%.*]] ], [ [[LKERN_0]], [[FOR_COND22_PREHEADER]] ] +; CHECK-NEXT: [[N21_0129:%.*]] = phi i32 [ [[INC42:%.*]], [[FOR_END40:%.*]] ], [ [[LKERN_0]], [[FOR_BODY25_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[CORRVOUT]], i32 [[N21_0129]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX28]], align 4 ; CHECK-NEXT: [[SUB29:%.*]] = sub nuw nsw i32 [[N21_0129]], [[LKERN_0]] ; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[SUB29]], 1 -; CHECK-NEXT: [[CMP32_NOT126:%.*]] = icmp ugt i32 [[ADD30]], [[N21_0129]] -; CHECK-NEXT: br i1 [[CMP32_NOT126]], label [[FOR_END40]], label [[FOR_BODY33:%.*]] -; CHECK: for.body33: -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP8:%.*]], [[FOR_BODY33]] ], [ 0.000000e+00, [[FOR_BODY25]] ] -; CHECK-NEXT: [[K27_0127:%.*]] = phi i32 [ [[INC39:%.*]], [[FOR_BODY33]] ], [ [[ADD30]], [[FOR_BODY25]] ] -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K27_0127]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ADD60:%.*]] = add i32 [[ADD30]], [[DIV536]] +; CHECK-NEXT: [[CMP32_NOT126:%.*]] = icmp ult i32 [[ADD30]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP32_NOT126]], label [[FOR_BODY33_PREHEADER:%.*]], label [[FOR_END164:%.*]] +; CHECK: for.body33.preheader: +; CHECK-NEXT: br label [[FOR_BODY33_15:%.*]] +; CHECK: for.body33.15: +; CHECK-NEXT: [[K27_0127:%.*]] = phi i32 [ [[ADD30]], [[FOR_BODY33_PREHEADER]] ], [ [[INC39_15:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ 
[[TMP64:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI10:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP65:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI11:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP66:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI12:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP67:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI13:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP68:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI14:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP69:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI15:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP70:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI16:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP71:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP72:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI18:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP73:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI19:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP74:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI20:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP75:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI21:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP76:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI22:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP77:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI23:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP78:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI24:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP79:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[INC39:%.*]] = add i32 [[K27_0127]], 1 +; CHECK-NEXT: [[INC39_1:%.*]] = add i32 [[K27_0127]], 2 +; CHECK-NEXT: [[INC39_2:%.*]] = add i32 [[K27_0127]], 3 +; CHECK-NEXT: [[INC39_3:%.*]] = add i32 [[K27_0127]], 4 +; CHECK-NEXT: [[INC39_4:%.*]] = add i32 [[K27_0127]], 5 +; CHECK-NEXT: [[INC39_5:%.*]] = add i32 [[K27_0127]], 6 +; CHECK-NEXT: [[INC39_6:%.*]] = add i32 [[K27_0127]], 7 +; CHECK-NEXT: [[INC39_7:%.*]] = add i32 [[K27_0127]], 8 +; CHECK-NEXT: [[INC39_8:%.*]] = add i32 [[K27_0127]], 9 +; CHECK-NEXT: [[INC39_9:%.*]] = add i32 [[K27_0127]], 10 +; CHECK-NEXT: [[INC39_10:%.*]] = add i32 [[K27_0127]], 11 +; CHECK-NEXT: [[INC39_11:%.*]] = add i32 [[K27_0127]], 12 +; CHECK-NEXT: [[INC39_12:%.*]] = add i32 [[K27_0127]], 13 +; CHECK-NEXT: [[INC39_13:%.*]] = add i32 [[K27_0127]], 14 +; CHECK-NEXT: [[INC39_14:%.*]] = add i32 [[K27_0127]], 15 +; CHECK-NEXT: [[INC39_15]] = add i32 [[K27_0127]], 16 ; CHECK-NEXT: [[SUB35:%.*]] = sub i32 [[K27_0127]], [[ADD30]] +; CHECK-NEXT: [[SUB35_1:%.*]] = sub i32 [[INC39]], [[ADD30]] +; CHECK-NEXT: [[SUB35_2:%.*]] = sub i32 [[INC39_1]], [[ADD30]] +; CHECK-NEXT: [[SUB35_3:%.*]] = sub i32 [[INC39_2]], [[ADD30]] +; CHECK-NEXT: [[SUB35_4:%.*]] = sub i32 [[INC39_3]], [[ADD30]] +; CHECK-NEXT: [[SUB35_5:%.*]] = sub i32 [[INC39_4]], [[ADD30]] +; CHECK-NEXT: [[SUB35_6:%.*]] = sub i32 [[INC39_5]], [[ADD30]] +; CHECK-NEXT: [[SUB35_7:%.*]] = sub i32 [[INC39_6]], [[ADD30]] +; CHECK-NEXT: [[SUB35_8:%.*]] = sub i32 [[INC39_7]], [[ADD30]] +; CHECK-NEXT: [[SUB35_9:%.*]] = sub i32 [[INC39_8]], [[ADD30]] +; CHECK-NEXT: [[SUB35_10:%.*]] = sub i32 [[INC39_9]], [[ADD30]] +; CHECK-NEXT: 
[[SUB35_11:%.*]] = sub i32 [[INC39_10]], [[ADD30]] +; CHECK-NEXT: [[SUB35_12:%.*]] = sub i32 [[INC39_11]], [[ADD30]] +; CHECK-NEXT: [[SUB35_13:%.*]] = sub i32 [[INC39_12]], [[ADD30]] +; CHECK-NEXT: [[SUB35_14:%.*]] = sub i32 [[INC39_13]], [[ADD30]] +; CHECK-NEXT: [[SUB35_15:%.*]] = sub i32 [[INC39_14]], [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K27_0127]] ; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 -; CHECK-NEXT: [[TMP8]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[TMP5]]) -; CHECK-NEXT: store float [[TMP8]], ptr [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[INC39]] = add i32 [[K27_0127]], 1 -; CHECK-NEXT: [[CMP32_NOT:%.*]] = icmp ugt i32 [[INC39]], [[N21_0129]] -; CHECK-NEXT: br i1 [[CMP32_NOT]], label [[FOR_END40]], label [[FOR_BODY33]] +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39]] +; CHECK-NEXT: [[ARRAYIDX36_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_1]] +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_1]] +; CHECK-NEXT: [[ARRAYIDX36_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_2]] +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_2]] +; CHECK-NEXT: [[ARRAYIDX36_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_3]] +; CHECK-NEXT: [[ARRAYIDX34_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_3]] +; CHECK-NEXT: [[ARRAYIDX36_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_4]] +; CHECK-NEXT: [[ARRAYIDX34_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_4]] +; CHECK-NEXT: [[ARRAYIDX36_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_5]] +; CHECK-NEXT: [[ARRAYIDX34_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_5]] +; CHECK-NEXT: [[ARRAYIDX36_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_6]] +; CHECK-NEXT: [[ARRAYIDX34_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_6]] +; CHECK-NEXT: [[ARRAYIDX36_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_7]] +; CHECK-NEXT: [[ARRAYIDX34_8:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_7]] +; CHECK-NEXT: [[ARRAYIDX36_8:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_8]] +; CHECK-NEXT: [[ARRAYIDX34_9:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_8]] +; CHECK-NEXT: [[ARRAYIDX36_9:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_9]] +; CHECK-NEXT: [[ARRAYIDX34_10:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_9]] +; CHECK-NEXT: [[ARRAYIDX36_10:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_10]] +; CHECK-NEXT: [[ARRAYIDX34_11:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_10]] +; CHECK-NEXT: [[ARRAYIDX36_11:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_11]] +; CHECK-NEXT: [[ARRAYIDX34_12:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_11]] +; CHECK-NEXT: [[ARRAYIDX36_12:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_12]] +; CHECK-NEXT: [[ARRAYIDX34_13:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_12]] +; CHECK-NEXT: [[ARRAYIDX36_13:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_13]] +; 
CHECK-NEXT: [[ARRAYIDX34_14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_13]] +; CHECK-NEXT: [[ARRAYIDX36_14:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_14]] +; CHECK-NEXT: [[ARRAYIDX34_15:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_14]] +; CHECK-NEXT: [[ARRAYIDX36_15:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_15]] +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX34_1]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX36_1]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX34_2]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX36_2]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX34_3]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[ARRAYIDX36_3]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[ARRAYIDX34_4]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[ARRAYIDX36_4]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX34_5]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX36_5]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX34_6]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX36_6]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX34_7]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[ARRAYIDX36_7]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[ARRAYIDX34_8]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX36_8]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX34_9]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX36_9]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX34_10]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[ARRAYIDX36_10]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX34_11]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX36_11]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX34_12]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[ARRAYIDX36_12]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[ARRAYIDX34_13]], align 4 +; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[ARRAYIDX36_13]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[ARRAYIDX34_14]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[ARRAYIDX36_14]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[ARRAYIDX34_15]], align 4 +; CHECK-NEXT: [[TMP63:%.*]] = load float, ptr [[ARRAYIDX36_15]], align 4 +; CHECK-NEXT: [[TMP64]] = tail call float @llvm.fmuladd.f32(float [[TMP32]], float [[TMP33]], float [[DOTPHI9]]) +; CHECK-NEXT: [[TMP65]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[DOTPHI10]]) +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP36]], float [[TMP37]], float [[DOTPHI11]]) +; CHECK-NEXT: [[TMP67]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[DOTPHI12]]) +; CHECK-NEXT: [[TMP68]] = tail call float @llvm.fmuladd.f32(float [[TMP40]], float [[TMP41]], float [[DOTPHI13]]) +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP42]], float [[TMP43]], float [[DOTPHI14]]) +; CHECK-NEXT: [[TMP70]] = tail call float @llvm.fmuladd.f32(float [[TMP44]], float [[TMP45]], float [[DOTPHI15]]) +; 
CHECK-NEXT: [[TMP71]] = tail call float @llvm.fmuladd.f32(float [[TMP46]], float [[TMP47]], float [[DOTPHI16]]) +; CHECK-NEXT: [[TMP72]] = tail call float @llvm.fmuladd.f32(float [[TMP48]], float [[TMP49]], float [[DOTPHI17]]) +; CHECK-NEXT: [[TMP73]] = tail call float @llvm.fmuladd.f32(float [[TMP50]], float [[TMP51]], float [[DOTPHI18]]) +; CHECK-NEXT: [[TMP74]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP53]], float [[DOTPHI19]]) +; CHECK-NEXT: [[TMP75]] = tail call float @llvm.fmuladd.f32(float [[TMP54]], float [[TMP55]], float [[DOTPHI20]]) +; CHECK-NEXT: [[TMP76]] = tail call float @llvm.fmuladd.f32(float [[TMP56]], float [[TMP57]], float [[DOTPHI21]]) +; CHECK-NEXT: [[TMP77]] = tail call float @llvm.fmuladd.f32(float [[TMP58]], float [[TMP59]], float [[DOTPHI22]]) +; CHECK-NEXT: [[TMP78]] = tail call float @llvm.fmuladd.f32(float [[TMP60]], float [[TMP61]], float [[DOTPHI23]]) +; CHECK-NEXT: [[TMP79]] = tail call float @llvm.fmuladd.f32(float [[TMP62]], float [[TMP63]], float [[DOTPHI24]]) +; CHECK-NEXT: [[CMP32_NOT_15:%.*]] = icmp ult i32 [[INC39_15]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP32_NOT_15]], label [[FOR_BODY33_15]], label [[FOR_END40_LOOPEXIT:%.*]] +; CHECK: for.end40.loopexit: +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[TMP64]], [[TMP65]] +; CHECK-NEXT: [[SUM46:%.*]] = fadd float [[TMP66]], [[TMP67]] +; CHECK-NEXT: [[SUM47:%.*]] = fadd float [[TMP68]], [[TMP69]] +; CHECK-NEXT: [[SUM48:%.*]] = fadd float [[TMP70]], [[TMP71]] +; CHECK-NEXT: [[SUM49:%.*]] = fadd float [[TMP72]], [[TMP73]] +; CHECK-NEXT: [[SUM50:%.*]] = fadd float [[TMP74]], [[TMP75]] +; CHECK-NEXT: [[SUM51:%.*]] = fadd float [[TMP76]], [[TMP77]] +; CHECK-NEXT: [[SUM52:%.*]] = fadd float [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[SUM53:%.*]] = fadd float [[SUM45]], [[SUM46]] +; CHECK-NEXT: [[SUM54:%.*]] = fadd float [[SUM47]], [[SUM48]] +; CHECK-NEXT: [[SUM55:%.*]] = fadd float [[SUM49]], [[SUM50]] +; CHECK-NEXT: [[SUM56:%.*]] = fadd float [[SUM51]], [[SUM52]] +; CHECK-NEXT: [[SUM57:%.*]] = fadd float [[SUM53]], [[SUM54]] +; CHECK-NEXT: [[SUM58:%.*]] = fadd float [[SUM55]], [[SUM56]] +; CHECK-NEXT: [[SUM59:%.*]] = fadd float [[SUM57]], [[SUM58]] +; CHECK-NEXT: br label [[FOR_END164]] +; CHECK: for.end164: +; CHECK-NEXT: [[PHI_SUM:%.*]] = phi i32 [ [[ADD30]], [[FOR_BODY25]] ], [ [[INC39_15]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY25]] ], [ [[SUM59]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT]], ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[CMP182_NOT587:%.*]] = icmp ugt i32 [[PHI_SUM]], [[N21_0129]] +; CHECK-NEXT: br i1 [[CMP182_NOT587]], label [[FOR_END40]], label [[FOR_BODY33_CLONE:%.*]] +; CHECK: for.body33.clone: +; CHECK-NEXT: [[TMP80:%.*]] = phi float [ [[TMP83:%.*]], [[FOR_BODY33_CLONE]] ], [ [[PHI_FLOAT]], [[FOR_END164]] ] +; CHECK-NEXT: [[K27_0127_CLONE:%.*]] = phi i32 [ [[INC39_CLONE:%.*]], [[FOR_BODY33_CLONE]] ], [ [[PHI_SUM]], [[FOR_END164]] ] +; CHECK-NEXT: [[ARRAYIDX34_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K27_0127_CLONE]] +; CHECK-NEXT: [[TMP81:%.*]] = load float, ptr [[ARRAYIDX34_CLONE]], align 4 +; CHECK-NEXT: [[SUB35_CLONE:%.*]] = sub i32 [[K27_0127_CLONE]], [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX36_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_CLONE]] +; CHECK-NEXT: [[TMP82:%.*]] = load float, ptr [[ARRAYIDX36_CLONE]], align 4 +; CHECK-NEXT: [[TMP83]] = tail call float @llvm.fmuladd.f32(float [[TMP81]], float [[TMP82]], float [[TMP80]]) +; CHECK-NEXT: 
[[INC39_CLONE]] = add i32 [[K27_0127_CLONE]], 1 +; CHECK-NEXT: [[CMP32_NOT_CLONE:%.*]] = icmp ugt i32 [[INC39_CLONE]], [[N21_0129]] +; CHECK-NEXT: br i1 [[CMP32_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE25:%.*]], label [[FOR_BODY33_CLONE]] +; CHECK: for.cond.for.end_crit_edge25: +; CHECK-NEXT: store float [[TMP83]], ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: br label [[FOR_END40]] ; CHECK: for.end40: ; CHECK-NEXT: [[INC42]] = add nuw nsw i32 [[N21_0129]], 1 ; CHECK-NEXT: [[EXITCOND135_NOT:%.*]] = icmp eq i32 [[INC42]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[EXITCOND135_NOT]], label [[FOR_COND45_PREHEADER]], label [[FOR_BODY25]] +; CHECK-NEXT: br i1 [[EXITCOND135_NOT]], label [[FOR_COND45_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY25]] ; CHECK: for.body50: ; CHECK-NEXT: [[N44_0133:%.*]] = phi i32 [ [[LSIG_0]], [[FOR_BODY50_LR_PH]] ], [ [[INC69:%.*]], [[FOR_END67:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds float, ptr [[CORRVOUT]], i32 [[N44_0133]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX54]], align 4 ; CHECK-NEXT: [[SUB55:%.*]] = sub nsw i32 [[N44_0133]], [[LKERN_0]] ; CHECK-NEXT: [[ADD56:%.*]] = add nsw i32 [[SUB55]], 1 -; CHECK-NEXT: [[CMP59_NOT130:%.*]] = icmp ugt i32 [[ADD56]], [[SUB57]] -; CHECK-NEXT: br i1 [[CMP59_NOT130]], label [[FOR_END67]], label [[FOR_BODY60:%.*]] -; CHECK: for.body60: -; CHECK-NEXT: [[TMP9:%.*]] = phi float [ [[TMP12:%.*]], [[FOR_BODY60]] ], [ 0.000000e+00, [[FOR_BODY50]] ] -; CHECK-NEXT: [[K53_0131:%.*]] = phi i32 [ [[INC66:%.*]], [[FOR_BODY60]] ], [ [[ADD56]], [[FOR_BODY50]] ] -; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K53_0131]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[ADD207_NEG:%.*]] = xor i32 [[SUB55]], -1 +; CHECK-NEXT: [[ADD211:%.*]] = add i32 [[ADD207_NEG]], [[LSIG_0]] +; CHECK-NEXT: [[DIV212535:%.*]] = and i32 [[ADD211]], -8 +; CHECK-NEXT: [[ADD214:%.*]] = add i32 [[DIV212535]], [[ADD56]] +; CHECK-NEXT: [[CMP59_NOT130:%.*]] = icmp ult i32 [[ADD56]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP59_NOT130]], label [[FOR_BODY60_PREHEADER:%.*]], label [[FOR_END16434:%.*]] +; CHECK: for.body60.preheader: +; CHECK-NEXT: br label [[FOR_BODY60_7:%.*]] +; CHECK: for.body60.7: +; CHECK-NEXT: [[K53_0131:%.*]] = phi i32 [ [[ADD56]], [[FOR_BODY60_PREHEADER]] ], [ [[INC66_7:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI26:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP100:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI27:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP101:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI28:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP102:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI29:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP103:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI30:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP104:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI31:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP105:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI32:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP106:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI33:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP107:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[INC66:%.*]] = add i32 [[K53_0131]], 1 +; CHECK-NEXT: [[INC66_1:%.*]] = add i32 [[K53_0131]], 2 +; CHECK-NEXT: [[INC66_2:%.*]] = add i32 [[K53_0131]], 3 +; 
CHECK-NEXT: [[INC66_3:%.*]] = add i32 [[K53_0131]], 4 +; CHECK-NEXT: [[INC66_4:%.*]] = add i32 [[K53_0131]], 5 +; CHECK-NEXT: [[INC66_5:%.*]] = add i32 [[K53_0131]], 6 +; CHECK-NEXT: [[INC66_6:%.*]] = add i32 [[K53_0131]], 7 +; CHECK-NEXT: [[INC66_7]] = add i32 [[K53_0131]], 8 ; CHECK-NEXT: [[SUB62:%.*]] = sub i32 [[K53_0131]], [[ADD56]] +; CHECK-NEXT: [[SUB62_1:%.*]] = sub i32 [[INC66]], [[ADD56]] +; CHECK-NEXT: [[SUB62_2:%.*]] = sub i32 [[INC66_1]], [[ADD56]] +; CHECK-NEXT: [[SUB62_3:%.*]] = sub i32 [[INC66_2]], [[ADD56]] +; CHECK-NEXT: [[SUB62_4:%.*]] = sub i32 [[INC66_3]], [[ADD56]] +; CHECK-NEXT: [[SUB62_5:%.*]] = sub i32 [[INC66_4]], [[ADD56]] +; CHECK-NEXT: [[SUB62_6:%.*]] = sub i32 [[INC66_5]], [[ADD56]] +; CHECK-NEXT: [[SUB62_7:%.*]] = sub i32 [[INC66_6]], [[ADD56]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K53_0131]] ; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX63]], align 4 -; CHECK-NEXT: [[TMP12]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[TMP9]]) -; CHECK-NEXT: store float [[TMP12]], ptr [[ARRAYIDX54]], align 4 -; CHECK-NEXT: [[INC66]] = add i32 [[K53_0131]], 1 -; CHECK-NEXT: [[CMP59_NOT:%.*]] = icmp ugt i32 [[INC66]], [[SUB57]] -; CHECK-NEXT: br i1 [[CMP59_NOT]], label [[FOR_END67]], label [[FOR_BODY60]] +; CHECK-NEXT: [[ARRAYIDX61_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66]] +; CHECK-NEXT: [[ARRAYIDX63_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_1]] +; CHECK-NEXT: [[ARRAYIDX61_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_1]] +; CHECK-NEXT: [[ARRAYIDX63_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_2]] +; CHECK-NEXT: [[ARRAYIDX61_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_2]] +; CHECK-NEXT: [[ARRAYIDX63_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_3]] +; CHECK-NEXT: [[ARRAYIDX61_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_3]] +; CHECK-NEXT: [[ARRAYIDX63_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_4]] +; CHECK-NEXT: [[ARRAYIDX61_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_4]] +; CHECK-NEXT: [[ARRAYIDX63_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_5]] +; CHECK-NEXT: [[ARRAYIDX61_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_5]] +; CHECK-NEXT: [[ARRAYIDX63_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_6]] +; CHECK-NEXT: [[ARRAYIDX61_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_6]] +; CHECK-NEXT: [[ARRAYIDX63_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_7]] +; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX63]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = load float, ptr [[ARRAYIDX61_1]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load float, ptr [[ARRAYIDX63_1]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = load float, ptr [[ARRAYIDX61_2]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load float, ptr [[ARRAYIDX63_2]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load float, ptr [[ARRAYIDX61_3]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[ARRAYIDX63_3]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[ARRAYIDX61_4]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[ARRAYIDX63_4]], align 4 +; 
CHECK-NEXT: [[TMP94:%.*]] = load float, ptr [[ARRAYIDX61_5]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = load float, ptr [[ARRAYIDX63_5]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = load float, ptr [[ARRAYIDX61_6]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load float, ptr [[ARRAYIDX63_6]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load float, ptr [[ARRAYIDX61_7]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load float, ptr [[ARRAYIDX63_7]], align 4 +; CHECK-NEXT: [[TMP100]] = tail call float @llvm.fmuladd.f32(float [[TMP84]], float [[TMP85]], float [[DOTPHI26]]) +; CHECK-NEXT: [[TMP101]] = tail call float @llvm.fmuladd.f32(float [[TMP86]], float [[TMP87]], float [[DOTPHI27]]) +; CHECK-NEXT: [[TMP102]] = tail call float @llvm.fmuladd.f32(float [[TMP88]], float [[TMP89]], float [[DOTPHI28]]) +; CHECK-NEXT: [[TMP103]] = tail call float @llvm.fmuladd.f32(float [[TMP90]], float [[TMP91]], float [[DOTPHI29]]) +; CHECK-NEXT: [[TMP104]] = tail call float @llvm.fmuladd.f32(float [[TMP92]], float [[TMP93]], float [[DOTPHI30]]) +; CHECK-NEXT: [[TMP105]] = tail call float @llvm.fmuladd.f32(float [[TMP94]], float [[TMP95]], float [[DOTPHI31]]) +; CHECK-NEXT: [[TMP106]] = tail call float @llvm.fmuladd.f32(float [[TMP96]], float [[TMP97]], float [[DOTPHI32]]) +; CHECK-NEXT: [[TMP107]] = tail call float @llvm.fmuladd.f32(float [[TMP98]], float [[TMP99]], float [[DOTPHI33]]) +; CHECK-NEXT: [[CMP59_NOT_7:%.*]] = icmp ult i32 [[INC66_7]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP59_NOT_7]], label [[FOR_BODY60_7]], label [[FOR_END67_LOOPEXIT:%.*]] +; CHECK: for.end67.loopexit: +; CHECK-NEXT: [[SUM60:%.*]] = fadd float [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[SUM61:%.*]] = fadd float [[TMP102]], [[TMP103]] +; CHECK-NEXT: [[SUM62:%.*]] = fadd float [[TMP104]], [[TMP105]] +; CHECK-NEXT: [[SUM63:%.*]] = fadd float [[TMP106]], [[TMP107]] +; CHECK-NEXT: [[SUM64:%.*]] = fadd float [[SUM60]], [[SUM61]] +; CHECK-NEXT: [[SUM65:%.*]] = fadd float [[SUM62]], [[SUM63]] +; CHECK-NEXT: [[SUM66:%.*]] = fadd float [[SUM64]], [[SUM65]] +; CHECK-NEXT: br label [[FOR_END16434]] +; CHECK: for.end16434: +; CHECK-NEXT: [[PHI_SUM35:%.*]] = phi i32 [ [[ADD56]], [[FOR_BODY50]] ], [ [[INC66_7]], [[FOR_END67_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT36:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY50]] ], [ [[SUM66]], [[FOR_END67_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT36]], ptr [[ARRAYIDX54]], align 4 +; CHECK-NEXT: [[CMP182_NOT58737:%.*]] = icmp ugt i32 [[PHI_SUM35]], [[SUB57]] +; CHECK-NEXT: br i1 [[CMP182_NOT58737]], label [[FOR_END67]], label [[FOR_BODY60_CLONE:%.*]] +; CHECK: for.body60.clone: +; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP111:%.*]], [[FOR_BODY60_CLONE]] ], [ [[PHI_FLOAT36]], [[FOR_END16434]] ] +; CHECK-NEXT: [[K53_0131_CLONE:%.*]] = phi i32 [ [[INC66_CLONE:%.*]], [[FOR_BODY60_CLONE]] ], [ [[PHI_SUM35]], [[FOR_END16434]] ] +; CHECK-NEXT: [[ARRAYIDX61_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K53_0131_CLONE]] +; CHECK-NEXT: [[TMP109:%.*]] = load float, ptr [[ARRAYIDX61_CLONE]], align 4 +; CHECK-NEXT: [[SUB62_CLONE:%.*]] = sub i32 [[K53_0131_CLONE]], [[ADD56]] +; CHECK-NEXT: [[ARRAYIDX63_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_CLONE]] +; CHECK-NEXT: [[TMP110:%.*]] = load float, ptr [[ARRAYIDX63_CLONE]], align 4 +; CHECK-NEXT: [[TMP111]] = tail call float @llvm.fmuladd.f32(float [[TMP109]], float [[TMP110]], float [[TMP108]]) +; CHECK-NEXT: [[INC66_CLONE]] = add i32 [[K53_0131_CLONE]], 1 +; CHECK-NEXT: [[CMP59_NOT_CLONE:%.*]] = icmp ugt i32 [[INC66_CLONE]], [[SUB57]] +; 
CHECK-NEXT: br i1 [[CMP59_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE38:%.*]], label [[FOR_BODY60_CLONE]] +; CHECK: for.cond.for.end_crit_edge38: +; CHECK-NEXT: store float [[TMP111]], ptr [[ARRAYIDX54]], align 4 +; CHECK-NEXT: br label [[FOR_END67]] ; CHECK: for.end67: ; CHECK-NEXT: [[INC69]] = add nsw i32 [[N44_0133]], 1 ; CHECK-NEXT: [[EXITCOND136_NOT:%.*]] = icmp eq i32 [[INC69]], [[SUB47]] -; CHECK-NEXT: br i1 [[EXITCOND136_NOT]], label [[RETURN]], label [[FOR_BODY50]] +; CHECK-NEXT: br i1 [[EXITCOND136_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_BODY50]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND45_PREHEADER]] ], [ 0, [[FOR_END67]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND45_PREHEADER]] ], [ 0, [[RETURN_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll index 33a08dfbf9df1c..86f9a334884556 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_conv_f32_ansi(ptr noundef readonly %Signal, i32 noundef %siglen, ptr noundef readonly %Kernel, i32 noundef %kernlen, ptr noundef writeonly %convout) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_conv_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noundef writeonly [[CONVOUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noalias noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noalias noundef writeonly [[CONVOUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[SIGNAL]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[KERNEL]], null @@ -21,34 +21,129 @@ define dso_local noundef i32 @dsps_conv_f32_ansi(ptr noundef readonly %Signal, i ; CHECK-NEXT: [[KERN_0:%.*]] = phi ptr [ [[SIGNAL]], [[IF_THEN8]] ], [ [[KERNEL]], [[IF_END6]] ] ; CHECK-NEXT: [[SIG_0:%.*]] = phi ptr [ [[KERNEL]], [[IF_THEN8]] ], [ [[SIGNAL]], [[IF_END6]] ] ; CHECK-NEXT: [[CMP10120:%.*]] = icmp sgt i32 [[LKERN_0]], 0 -; CHECK-NEXT: br i1 [[CMP10120]], label [[FOR_BODY:%.*]], label [[FOR_COND21_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP10120]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND21_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond21.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND21_PREHEADER]] ; CHECK: for.cond21.preheader: ; CHECK-NEXT: [[CMP22125:%.*]] = icmp slt i32 [[LKERN_0]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[CMP22125]], label [[FOR_BODY24:%.*]], label [[FOR_COND42_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP22125]], label [[FOR_BODY24_PREHEADER:%.*]], label [[FOR_COND42_PREHEADER:%.*]] +; CHECK: 
for.body24.preheader: +; CHECK-NEXT: [[DIV536:%.*]] = and i32 [[LKERN_0]], -16 +; CHECK-NEXT: br label [[FOR_BODY24:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[IF_END9]] ] -; CHECK-NEXT: [[N_0121:%.*]] = phi i32 [ [[INC18:%.*]], [[FOR_END]] ], [ 0, [[IF_END9]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_0121:%.*]] = phi i32 [ [[INC18:%.*]], [[FOR_END]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[N_0121]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[N_0121]], 2147483640 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[CONVOUT]], i32 [[N_0121]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[FOR_BODY13:%.*]] -; CHECK: for.body13: -; CHECK-NEXT: [[K_0119:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY13]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP3:%.*]], [[FOR_BODY13]] ] -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0119]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY13_CLONE_PREHEADER:%.*]], label [[FOR_BODY13_7:%.*]] +; CHECK: for.body13.7: +; CHECK-NEXT: [[K_0119:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC_7:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP19:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI1:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP20:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI2:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI3:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP22:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP23:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI5:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP26:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[K_0119]], 1 +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[K_0119]], 2 +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[K_0119]], 3 +; CHECK-NEXT: [[INC_3:%.*]] = add nuw nsw i32 [[K_0119]], 4 +; CHECK-NEXT: [[INC_4:%.*]] = add nuw nsw i32 [[K_0119]], 5 +; CHECK-NEXT: [[INC_5:%.*]] = add nuw nsw i32 [[K_0119]], 6 +; CHECK-NEXT: [[INC_6:%.*]] = add nuw nsw i32 [[K_0119]], 7 +; CHECK-NEXT: [[INC_7]] = add nuw nsw i32 [[K_0119]], 8 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N_0121]], [[K_0119]] +; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[N_0121]], [[INC]] +; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[N_0121]], [[INC_1]] +; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[N_0121]], [[INC_2]] +; CHECK-NEXT: [[SUB_4:%.*]] = sub nsw i32 [[N_0121]], [[INC_3]] +; CHECK-NEXT: [[SUB_5:%.*]] = sub nsw i32 [[N_0121]], [[INC_4]] +; CHECK-NEXT: [[SUB_6:%.*]] = sub nsw i32 [[N_0121]], [[INC_5]] +; CHECK-NEXT: [[SUB_7:%.*]] = sub nsw i32 [[N_0121]], [[INC_6]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0119]] ; CHECK-NEXT: 
[[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP3]] = tail call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP2]], float [[TMP0]]) -; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K_0119]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY13]] +; CHECK-NEXT: [[ARRAYIDX14_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC]] +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_1]] +; CHECK-NEXT: [[ARRAYIDX14_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_1]] +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_2]] +; CHECK-NEXT: [[ARRAYIDX14_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_2]] +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_3]] +; CHECK-NEXT: [[ARRAYIDX14_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_3]] +; CHECK-NEXT: [[ARRAYIDX15_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_4]] +; CHECK-NEXT: [[ARRAYIDX14_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_4]] +; CHECK-NEXT: [[ARRAYIDX15_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_5]] +; CHECK-NEXT: [[ARRAYIDX14_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_5]] +; CHECK-NEXT: [[ARRAYIDX15_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_6]] +; CHECK-NEXT: [[ARRAYIDX14_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_6]] +; CHECK-NEXT: [[ARRAYIDX15_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_7]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX14_1]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX15_1]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14_2]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX15_2]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14_3]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15_3]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX14_4]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX15_4]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX14_5]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX15_5]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX14_6]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX15_6]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX14_7]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX15_7]], align 4 +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP3]], float [[TMP4]], float [[DOTPHI]]) +; CHECK-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[DOTPHI1]]) +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP7]], float [[TMP8]], float [[DOTPHI2]]) +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[DOTPHI3]]) +; CHECK-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float 
[[TMP11]], float [[TMP12]], float [[DOTPHI4]]) +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP13]], float [[TMP14]], float [[DOTPHI5]]) +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[DOTPHI6]]) +; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP17]], float [[TMP18]], float [[DOTPHI7]]) +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp ult i32 [[INC_7]], [[TMP1]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[FOR_BODY13_7]], label [[FOR_END8:%.*]] +; CHECK: for.end8: +; CHECK-NEXT: [[SUM:%.*]] = fadd float [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[SUM24:%.*]] = fadd float [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[SUM25:%.*]] = fadd float [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[SUM26:%.*]] = fadd float [[SUM]], [[SUM23]] +; CHECK-NEXT: [[SUM27:%.*]] = fadd float [[SUM24]], [[SUM25]] +; CHECK-NEXT: [[SUM28:%.*]] = fadd float [[SUM26]], [[SUM27]] +; CHECK-NEXT: store float [[SUM28]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br i1 false, label [[FOR_END]], label [[FOR_BODY13_CLONE_PREHEADER]] +; CHECK: for.body13.clone.preheader: +; CHECK-NEXT: [[SUM_PHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[SUM28]], [[FOR_END8]] ] +; CHECK-NEXT: [[ADD_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP0]], [[FOR_END8]] ] +; CHECK-NEXT: br label [[FOR_BODY13_CLONE:%.*]] +; CHECK: for.body13.clone: +; CHECK-NEXT: [[K_0119_CLONE:%.*]] = phi i32 [ [[ADD_PHI]], [[FOR_BODY13_CLONE_PREHEADER]] ], [ [[INC_CLONE:%.*]], [[FOR_BODY13_CLONE]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi float [ [[SUM_PHI]], [[FOR_BODY13_CLONE_PREHEADER]] ], [ [[TMP30:%.*]], [[FOR_BODY13_CLONE]] ] +; CHECK-NEXT: [[ARRAYIDX14_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0119_CLONE]] +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX14_CLONE]], align 4 +; CHECK-NEXT: [[SUB_CLONE:%.*]] = sub nsw i32 [[N_0121]], [[K_0119_CLONE]] +; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_CLONE]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 +; CHECK-NEXT: [[TMP30]] = tail call float @llvm.fmuladd.f32(float [[TMP28]], float [[TMP29]], float [[TMP27]]) +; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[K_0119_CLONE]], 1 +; CHECK-NEXT: [[EXITCOND_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[INDVARS_IV]] +; CHECK-NEXT: br i1 [[EXITCOND_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY13_CLONE]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: store float [[TMP30]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[INC18]] = add nuw nsw i32 [[N_0121]], 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND132_NOT:%.*]] = icmp eq i32 [[INC18]], [[LKERN_0]] -; CHECK-NEXT: br i1 [[EXITCOND132_NOT]], label [[FOR_COND21_PREHEADER]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND132_NOT]], label [[FOR_COND21_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.cond42.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND42_PREHEADER]] ; CHECK: for.cond42.preheader: ; CHECK-NEXT: [[ADD43:%.*]] = add i32 [[SIGLEN]], -1 ; CHECK-NEXT: [[SUB44:%.*]] = add i32 [[ADD43]], [[KERNLEN]] @@ -58,57 +153,308 @@ define dso_local noundef i32 @dsps_conv_f32_ansi(ptr noundef readonly %Signal, i ; CHECK-NEXT: [[SUB54:%.*]] = add nsw i32 [[LSIG_0]], -1 ; CHECK-NEXT: br label [[FOR_BODY47:%.*]] ; CHECK: for.body24: -; 
CHECK-NEXT: [[N20_0126:%.*]] = phi i32 [ [[INC39:%.*]], [[FOR_END37:%.*]] ], [ [[LKERN_0]], [[FOR_COND21_PREHEADER]] ] +; CHECK-NEXT: [[N20_0126:%.*]] = phi i32 [ [[INC39:%.*]], [[FOR_END37:%.*]] ], [ [[LKERN_0]], [[FOR_BODY24_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[CONVOUT]], i32 [[N20_0126]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX26]], align 4 ; CHECK-NEXT: [[SUB27:%.*]] = sub nuw nsw i32 [[N20_0126]], [[LKERN_0]] ; CHECK-NEXT: [[K25_0122:%.*]] = add i32 [[SUB27]], 1 -; CHECK-NEXT: [[CMP29_NOT123:%.*]] = icmp ugt i32 [[K25_0122]], [[N20_0126]] -; CHECK-NEXT: br i1 [[CMP29_NOT123]], label [[FOR_END37]], label [[FOR_BODY30:%.*]] -; CHECK: for.body30: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP7:%.*]], [[FOR_BODY30]] ], [ 0.000000e+00, [[FOR_BODY24]] ] -; CHECK-NEXT: [[K25_0124:%.*]] = phi i32 [ [[K25_0:%.*]], [[FOR_BODY30]] ], [ [[K25_0122]], [[FOR_BODY24]] ] -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0124]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[ADD60:%.*]] = add i32 [[K25_0122]], [[DIV536]] +; CHECK-NEXT: [[CMP29_NOT123:%.*]] = icmp ult i32 [[K25_0122]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP29_NOT123]], label [[FOR_BODY30_PREHEADER:%.*]], label [[FOR_END164:%.*]] +; CHECK: for.body30.preheader: +; CHECK-NEXT: br label [[FOR_BODY30_15:%.*]] +; CHECK: for.body30.15: +; CHECK-NEXT: [[K25_0124:%.*]] = phi i32 [ [[K25_0122]], [[FOR_BODY30_PREHEADER]] ], [ [[K25_0_15:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP63:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI10:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP64:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI11:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP65:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI12:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP66:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI13:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP67:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI14:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP68:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI15:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP69:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI16:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP70:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP71:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI18:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP72:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI19:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP73:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI20:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP74:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI21:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP75:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI22:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP76:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI23:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP77:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI24:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP78:%.*]], 
[[FOR_BODY30_15]] ] +; CHECK-NEXT: [[K25_0:%.*]] = add i32 [[K25_0124]], 1 +; CHECK-NEXT: [[K25_0_1:%.*]] = add i32 [[K25_0124]], 2 +; CHECK-NEXT: [[K25_0_2:%.*]] = add i32 [[K25_0124]], 3 +; CHECK-NEXT: [[K25_0_3:%.*]] = add i32 [[K25_0124]], 4 +; CHECK-NEXT: [[K25_0_4:%.*]] = add i32 [[K25_0124]], 5 +; CHECK-NEXT: [[K25_0_5:%.*]] = add i32 [[K25_0124]], 6 +; CHECK-NEXT: [[K25_0_6:%.*]] = add i32 [[K25_0124]], 7 +; CHECK-NEXT: [[K25_0_7:%.*]] = add i32 [[K25_0124]], 8 +; CHECK-NEXT: [[K25_0_8:%.*]] = add i32 [[K25_0124]], 9 +; CHECK-NEXT: [[K25_0_9:%.*]] = add i32 [[K25_0124]], 10 +; CHECK-NEXT: [[K25_0_10:%.*]] = add i32 [[K25_0124]], 11 +; CHECK-NEXT: [[K25_0_11:%.*]] = add i32 [[K25_0124]], 12 +; CHECK-NEXT: [[K25_0_12:%.*]] = add i32 [[K25_0124]], 13 +; CHECK-NEXT: [[K25_0_13:%.*]] = add i32 [[K25_0124]], 14 +; CHECK-NEXT: [[K25_0_14:%.*]] = add i32 [[K25_0124]], 15 +; CHECK-NEXT: [[K25_0_15]] = add i32 [[K25_0124]], 16 ; CHECK-NEXT: [[SUB32:%.*]] = sub i32 [[N20_0126]], [[K25_0124]] +; CHECK-NEXT: [[SUB32_1:%.*]] = sub i32 [[N20_0126]], [[K25_0]] +; CHECK-NEXT: [[SUB32_2:%.*]] = sub i32 [[N20_0126]], [[K25_0_1]] +; CHECK-NEXT: [[SUB32_3:%.*]] = sub i32 [[N20_0126]], [[K25_0_2]] +; CHECK-NEXT: [[SUB32_4:%.*]] = sub i32 [[N20_0126]], [[K25_0_3]] +; CHECK-NEXT: [[SUB32_5:%.*]] = sub i32 [[N20_0126]], [[K25_0_4]] +; CHECK-NEXT: [[SUB32_6:%.*]] = sub i32 [[N20_0126]], [[K25_0_5]] +; CHECK-NEXT: [[SUB32_7:%.*]] = sub i32 [[N20_0126]], [[K25_0_6]] +; CHECK-NEXT: [[SUB32_8:%.*]] = sub i32 [[N20_0126]], [[K25_0_7]] +; CHECK-NEXT: [[SUB32_9:%.*]] = sub i32 [[N20_0126]], [[K25_0_8]] +; CHECK-NEXT: [[SUB32_10:%.*]] = sub i32 [[N20_0126]], [[K25_0_9]] +; CHECK-NEXT: [[SUB32_11:%.*]] = sub i32 [[N20_0126]], [[K25_0_10]] +; CHECK-NEXT: [[SUB32_12:%.*]] = sub i32 [[N20_0126]], [[K25_0_11]] +; CHECK-NEXT: [[SUB32_13:%.*]] = sub i32 [[N20_0126]], [[K25_0_12]] +; CHECK-NEXT: [[SUB32_14:%.*]] = sub i32 [[N20_0126]], [[K25_0_13]] +; CHECK-NEXT: [[SUB32_15:%.*]] = sub i32 [[N20_0126]], [[K25_0_14]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0124]] ; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 -; CHECK-NEXT: [[TMP7]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[TMP4]]) -; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX26]], align 4 -; CHECK-NEXT: [[K25_0]] = add i32 [[K25_0124]], 1 -; CHECK-NEXT: [[CMP29_NOT:%.*]] = icmp ugt i32 [[K25_0]], [[N20_0126]] -; CHECK-NEXT: br i1 [[CMP29_NOT]], label [[FOR_END37]], label [[FOR_BODY30]] +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0]] +; CHECK-NEXT: [[ARRAYIDX33_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_1]] +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_1]] +; CHECK-NEXT: [[ARRAYIDX33_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_2]] +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_2]] +; CHECK-NEXT: [[ARRAYIDX33_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_3]] +; CHECK-NEXT: [[ARRAYIDX31_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_3]] +; CHECK-NEXT: [[ARRAYIDX33_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_4]] +; CHECK-NEXT: [[ARRAYIDX31_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_4]] +; 
CHECK-NEXT: [[ARRAYIDX33_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_5]] +; CHECK-NEXT: [[ARRAYIDX31_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_5]] +; CHECK-NEXT: [[ARRAYIDX33_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_6]] +; CHECK-NEXT: [[ARRAYIDX31_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_6]] +; CHECK-NEXT: [[ARRAYIDX33_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_7]] +; CHECK-NEXT: [[ARRAYIDX31_8:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_7]] +; CHECK-NEXT: [[ARRAYIDX33_8:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_8]] +; CHECK-NEXT: [[ARRAYIDX31_9:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_8]] +; CHECK-NEXT: [[ARRAYIDX33_9:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_9]] +; CHECK-NEXT: [[ARRAYIDX31_10:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_9]] +; CHECK-NEXT: [[ARRAYIDX33_10:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_10]] +; CHECK-NEXT: [[ARRAYIDX31_11:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_10]] +; CHECK-NEXT: [[ARRAYIDX33_11:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_11]] +; CHECK-NEXT: [[ARRAYIDX31_12:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_11]] +; CHECK-NEXT: [[ARRAYIDX33_12:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_12]] +; CHECK-NEXT: [[ARRAYIDX31_13:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_12]] +; CHECK-NEXT: [[ARRAYIDX33_13:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_13]] +; CHECK-NEXT: [[ARRAYIDX31_14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_13]] +; CHECK-NEXT: [[ARRAYIDX33_14:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_14]] +; CHECK-NEXT: [[ARRAYIDX31_15:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_14]] +; CHECK-NEXT: [[ARRAYIDX33_15:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_15]] +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX31_1]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX33_1]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX31_2]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX33_2]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX31_3]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX33_3]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[ARRAYIDX31_4]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[ARRAYIDX33_4]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[ARRAYIDX31_5]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX33_5]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX31_6]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX33_6]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX31_7]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX33_7]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[ARRAYIDX31_8]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[ARRAYIDX33_8]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX31_9]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr 
[[ARRAYIDX33_9]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX31_10]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX33_10]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[ARRAYIDX31_11]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX33_11]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX31_12]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX33_12]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[ARRAYIDX31_13]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[ARRAYIDX33_13]], align 4 +; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[ARRAYIDX31_14]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[ARRAYIDX33_14]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[ARRAYIDX31_15]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[ARRAYIDX33_15]], align 4 +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP31]], float [[TMP32]], float [[DOTPHI9]]) +; CHECK-NEXT: [[TMP64]] = tail call float @llvm.fmuladd.f32(float [[TMP33]], float [[TMP34]], float [[DOTPHI10]]) +; CHECK-NEXT: [[TMP65]] = tail call float @llvm.fmuladd.f32(float [[TMP35]], float [[TMP36]], float [[DOTPHI11]]) +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[DOTPHI12]]) +; CHECK-NEXT: [[TMP67]] = tail call float @llvm.fmuladd.f32(float [[TMP39]], float [[TMP40]], float [[DOTPHI13]]) +; CHECK-NEXT: [[TMP68]] = tail call float @llvm.fmuladd.f32(float [[TMP41]], float [[TMP42]], float [[DOTPHI14]]) +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP43]], float [[TMP44]], float [[DOTPHI15]]) +; CHECK-NEXT: [[TMP70]] = tail call float @llvm.fmuladd.f32(float [[TMP45]], float [[TMP46]], float [[DOTPHI16]]) +; CHECK-NEXT: [[TMP71]] = tail call float @llvm.fmuladd.f32(float [[TMP47]], float [[TMP48]], float [[DOTPHI17]]) +; CHECK-NEXT: [[TMP72]] = tail call float @llvm.fmuladd.f32(float [[TMP49]], float [[TMP50]], float [[DOTPHI18]]) +; CHECK-NEXT: [[TMP73]] = tail call float @llvm.fmuladd.f32(float [[TMP51]], float [[TMP52]], float [[DOTPHI19]]) +; CHECK-NEXT: [[TMP74]] = tail call float @llvm.fmuladd.f32(float [[TMP53]], float [[TMP54]], float [[DOTPHI20]]) +; CHECK-NEXT: [[TMP75]] = tail call float @llvm.fmuladd.f32(float [[TMP55]], float [[TMP56]], float [[DOTPHI21]]) +; CHECK-NEXT: [[TMP76]] = tail call float @llvm.fmuladd.f32(float [[TMP57]], float [[TMP58]], float [[DOTPHI22]]) +; CHECK-NEXT: [[TMP77]] = tail call float @llvm.fmuladd.f32(float [[TMP59]], float [[TMP60]], float [[DOTPHI23]]) +; CHECK-NEXT: [[TMP78]] = tail call float @llvm.fmuladd.f32(float [[TMP61]], float [[TMP62]], float [[DOTPHI24]]) +; CHECK-NEXT: [[CMP29_NOT_15:%.*]] = icmp ult i32 [[K25_0_15]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP29_NOT_15]], label [[FOR_BODY30_15]], label [[FOR_END37_LOOPEXIT:%.*]] +; CHECK: for.end37.loopexit: +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[SUM46:%.*]] = fadd float [[TMP65]], [[TMP66]] +; CHECK-NEXT: [[SUM47:%.*]] = fadd float [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[SUM48:%.*]] = fadd float [[TMP69]], [[TMP70]] +; CHECK-NEXT: [[SUM49:%.*]] = fadd float [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[SUM50:%.*]] = fadd float [[TMP73]], [[TMP74]] +; CHECK-NEXT: [[SUM51:%.*]] = fadd float [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[SUM52:%.*]] = fadd float [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[SUM53:%.*]] = fadd float [[SUM45]], [[SUM46]] +; CHECK-NEXT: [[SUM54:%.*]] = 
fadd float [[SUM47]], [[SUM48]] +; CHECK-NEXT: [[SUM55:%.*]] = fadd float [[SUM49]], [[SUM50]] +; CHECK-NEXT: [[SUM56:%.*]] = fadd float [[SUM51]], [[SUM52]] +; CHECK-NEXT: [[SUM57:%.*]] = fadd float [[SUM53]], [[SUM54]] +; CHECK-NEXT: [[SUM58:%.*]] = fadd float [[SUM55]], [[SUM56]] +; CHECK-NEXT: [[SUM59:%.*]] = fadd float [[SUM57]], [[SUM58]] +; CHECK-NEXT: br label [[FOR_END164]] +; CHECK: for.end164: +; CHECK-NEXT: [[PHI_SUM:%.*]] = phi i32 [ [[K25_0122]], [[FOR_BODY24]] ], [ [[K25_0_15]], [[FOR_END37_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY24]] ], [ [[SUM59]], [[FOR_END37_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT]], ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[CMP182_NOT587:%.*]] = icmp ugt i32 [[PHI_SUM]], [[N20_0126]] +; CHECK-NEXT: br i1 [[CMP182_NOT587]], label [[FOR_END37]], label [[FOR_BODY30_CLONE:%.*]] +; CHECK: for.body30.clone: +; CHECK-NEXT: [[TMP79:%.*]] = phi float [ [[TMP82:%.*]], [[FOR_BODY30_CLONE]] ], [ [[PHI_FLOAT]], [[FOR_END164]] ] +; CHECK-NEXT: [[K25_0124_CLONE:%.*]] = phi i32 [ [[K25_0_CLONE:%.*]], [[FOR_BODY30_CLONE]] ], [ [[PHI_SUM]], [[FOR_END164]] ] +; CHECK-NEXT: [[ARRAYIDX31_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0124_CLONE]] +; CHECK-NEXT: [[TMP80:%.*]] = load float, ptr [[ARRAYIDX31_CLONE]], align 4 +; CHECK-NEXT: [[SUB32_CLONE:%.*]] = sub i32 [[N20_0126]], [[K25_0124_CLONE]] +; CHECK-NEXT: [[ARRAYIDX33_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_CLONE]] +; CHECK-NEXT: [[TMP81:%.*]] = load float, ptr [[ARRAYIDX33_CLONE]], align 4 +; CHECK-NEXT: [[TMP82]] = tail call float @llvm.fmuladd.f32(float [[TMP80]], float [[TMP81]], float [[TMP79]]) +; CHECK-NEXT: [[K25_0_CLONE]] = add i32 [[K25_0124_CLONE]], 1 +; CHECK-NEXT: [[CMP29_NOT_CLONE:%.*]] = icmp ugt i32 [[K25_0_CLONE]], [[N20_0126]] +; CHECK-NEXT: br i1 [[CMP29_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE25:%.*]], label [[FOR_BODY30_CLONE]] +; CHECK: for.cond.for.end_crit_edge25: +; CHECK-NEXT: store float [[TMP82]], ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: br label [[FOR_END37]] ; CHECK: for.end37: ; CHECK-NEXT: [[INC39]] = add nuw nsw i32 [[N20_0126]], 1 ; CHECK-NEXT: [[EXITCOND133_NOT:%.*]] = icmp eq i32 [[INC39]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[EXITCOND133_NOT]], label [[FOR_COND42_PREHEADER]], label [[FOR_BODY24]] +; CHECK-NEXT: br i1 [[EXITCOND133_NOT]], label [[FOR_COND42_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY24]] ; CHECK: for.body47: ; CHECK-NEXT: [[N41_0131:%.*]] = phi i32 [ [[LSIG_0]], [[FOR_BODY47_LR_PH]] ], [ [[INC66:%.*]], [[FOR_END64:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[CONVOUT]], i32 [[N41_0131]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX51]], align 4 ; CHECK-NEXT: [[SUB52:%.*]] = sub nsw i32 [[N41_0131]], [[LKERN_0]] ; CHECK-NEXT: [[K50_0127:%.*]] = add i32 [[SUB52]], 1 -; CHECK-NEXT: [[CMP56_NOT128:%.*]] = icmp ugt i32 [[K50_0127]], [[SUB54]] -; CHECK-NEXT: br i1 [[CMP56_NOT128]], label [[FOR_END64]], label [[FOR_BODY57:%.*]] -; CHECK: for.body57: -; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP11:%.*]], [[FOR_BODY57]] ], [ 0.000000e+00, [[FOR_BODY47]] ] -; CHECK-NEXT: [[K50_0129:%.*]] = phi i32 [ [[K50_0:%.*]], [[FOR_BODY57]] ], [ [[K50_0127]], [[FOR_BODY47]] ] -; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0129]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX58]], align 4 +; CHECK-NEXT: [[ADD207_NEG:%.*]] = xor i32 [[SUB52]], -1 +; CHECK-NEXT: 
[[ADD211:%.*]] = add i32 [[ADD207_NEG]], [[LSIG_0]] +; CHECK-NEXT: [[DIV212535:%.*]] = and i32 [[ADD211]], -8 +; CHECK-NEXT: [[ADD214:%.*]] = add i32 [[DIV212535]], [[K50_0127]] +; CHECK-NEXT: [[CMP56_NOT128:%.*]] = icmp ult i32 [[K50_0127]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP56_NOT128]], label [[FOR_BODY57_PREHEADER:%.*]], label [[FOR_END16434:%.*]] +; CHECK: for.body57.preheader: +; CHECK-NEXT: br label [[FOR_BODY57_7:%.*]] +; CHECK: for.body57.7: +; CHECK-NEXT: [[K50_0129:%.*]] = phi i32 [ [[K50_0127]], [[FOR_BODY57_PREHEADER]] ], [ [[K50_0_7:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI26:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP99:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI27:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP100:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI28:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP101:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI29:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP102:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI30:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP103:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI31:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP104:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI32:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP105:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI33:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP106:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[K50_0:%.*]] = add i32 [[K50_0129]], 1 +; CHECK-NEXT: [[K50_0_1:%.*]] = add i32 [[K50_0129]], 2 +; CHECK-NEXT: [[K50_0_2:%.*]] = add i32 [[K50_0129]], 3 +; CHECK-NEXT: [[K50_0_3:%.*]] = add i32 [[K50_0129]], 4 +; CHECK-NEXT: [[K50_0_4:%.*]] = add i32 [[K50_0129]], 5 +; CHECK-NEXT: [[K50_0_5:%.*]] = add i32 [[K50_0129]], 6 +; CHECK-NEXT: [[K50_0_6:%.*]] = add i32 [[K50_0129]], 7 +; CHECK-NEXT: [[K50_0_7]] = add i32 [[K50_0129]], 8 ; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[N41_0131]], [[K50_0129]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[N41_0131]], [[K50_0]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[N41_0131]], [[K50_0_1]] +; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[N41_0131]], [[K50_0_2]] +; CHECK-NEXT: [[SUB59_4:%.*]] = sub i32 [[N41_0131]], [[K50_0_3]] +; CHECK-NEXT: [[SUB59_5:%.*]] = sub i32 [[N41_0131]], [[K50_0_4]] +; CHECK-NEXT: [[SUB59_6:%.*]] = sub i32 [[N41_0131]], [[K50_0_5]] +; CHECK-NEXT: [[SUB59_7:%.*]] = sub i32 [[N41_0131]], [[K50_0_6]] +; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0129]] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 -; CHECK-NEXT: [[TMP11]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[TMP8]]) -; CHECK-NEXT: store float [[TMP11]], ptr [[ARRAYIDX51]], align 4 -; CHECK-NEXT: [[K50_0]] = add i32 [[K50_0129]], 1 -; CHECK-NEXT: [[CMP56_NOT:%.*]] = icmp ugt i32 [[K50_0]], [[SUB54]] -; CHECK-NEXT: br i1 [[CMP56_NOT]], label [[FOR_END64]], label [[FOR_BODY57]] +; CHECK-NEXT: [[ARRAYIDX58_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0]] +; CHECK-NEXT: [[ARRAYIDX60_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_1]] +; CHECK-NEXT: [[ARRAYIDX58_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_1]] +; CHECK-NEXT: [[ARRAYIDX60_2:%.*]] = getelementptr 
inbounds float, ptr [[KERN_0]], i32 [[SUB59_2]] +; CHECK-NEXT: [[ARRAYIDX58_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_2]] +; CHECK-NEXT: [[ARRAYIDX60_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_3]] +; CHECK-NEXT: [[ARRAYIDX58_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_3]] +; CHECK-NEXT: [[ARRAYIDX60_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_4]] +; CHECK-NEXT: [[ARRAYIDX58_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_4]] +; CHECK-NEXT: [[ARRAYIDX60_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_5]] +; CHECK-NEXT: [[ARRAYIDX58_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_5]] +; CHECK-NEXT: [[ARRAYIDX60_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_6]] +; CHECK-NEXT: [[ARRAYIDX58_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_6]] +; CHECK-NEXT: [[ARRAYIDX60_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_7]] +; CHECK-NEXT: [[TMP83:%.*]] = load float, ptr [[ARRAYIDX58]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX58_1]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = load float, ptr [[ARRAYIDX60_1]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load float, ptr [[ARRAYIDX58_2]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = load float, ptr [[ARRAYIDX60_2]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load float, ptr [[ARRAYIDX58_3]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load float, ptr [[ARRAYIDX60_3]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[ARRAYIDX58_4]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[ARRAYIDX60_4]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[ARRAYIDX58_5]], align 4 +; CHECK-NEXT: [[TMP94:%.*]] = load float, ptr [[ARRAYIDX60_5]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = load float, ptr [[ARRAYIDX58_6]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = load float, ptr [[ARRAYIDX60_6]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load float, ptr [[ARRAYIDX58_7]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load float, ptr [[ARRAYIDX60_7]], align 4 +; CHECK-NEXT: [[TMP99]] = tail call float @llvm.fmuladd.f32(float [[TMP83]], float [[TMP84]], float [[DOTPHI26]]) +; CHECK-NEXT: [[TMP100]] = tail call float @llvm.fmuladd.f32(float [[TMP85]], float [[TMP86]], float [[DOTPHI27]]) +; CHECK-NEXT: [[TMP101]] = tail call float @llvm.fmuladd.f32(float [[TMP87]], float [[TMP88]], float [[DOTPHI28]]) +; CHECK-NEXT: [[TMP102]] = tail call float @llvm.fmuladd.f32(float [[TMP89]], float [[TMP90]], float [[DOTPHI29]]) +; CHECK-NEXT: [[TMP103]] = tail call float @llvm.fmuladd.f32(float [[TMP91]], float [[TMP92]], float [[DOTPHI30]]) +; CHECK-NEXT: [[TMP104]] = tail call float @llvm.fmuladd.f32(float [[TMP93]], float [[TMP94]], float [[DOTPHI31]]) +; CHECK-NEXT: [[TMP105]] = tail call float @llvm.fmuladd.f32(float [[TMP95]], float [[TMP96]], float [[DOTPHI32]]) +; CHECK-NEXT: [[TMP106]] = tail call float @llvm.fmuladd.f32(float [[TMP97]], float [[TMP98]], float [[DOTPHI33]]) +; CHECK-NEXT: [[CMP56_NOT_7:%.*]] = icmp ult i32 [[K50_0_7]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP56_NOT_7]], label [[FOR_BODY57_7]], label [[FOR_END64_LOOPEXIT:%.*]] +; CHECK: for.end64.loopexit: +; CHECK-NEXT: [[SUM60:%.*]] = fadd float [[TMP99]], [[TMP100]] +; CHECK-NEXT: [[SUM61:%.*]] = fadd float [[TMP101]], [[TMP102]] +; CHECK-NEXT: [[SUM62:%.*]] = fadd float [[TMP103]], [[TMP104]] +; CHECK-NEXT: 
[[SUM63:%.*]] = fadd float [[TMP105]], [[TMP106]] +; CHECK-NEXT: [[SUM64:%.*]] = fadd float [[SUM60]], [[SUM61]] +; CHECK-NEXT: [[SUM65:%.*]] = fadd float [[SUM62]], [[SUM63]] +; CHECK-NEXT: [[SUM66:%.*]] = fadd float [[SUM64]], [[SUM65]] +; CHECK-NEXT: br label [[FOR_END16434]] +; CHECK: for.end16434: +; CHECK-NEXT: [[PHI_SUM35:%.*]] = phi i32 [ [[K50_0127]], [[FOR_BODY47]] ], [ [[K50_0_7]], [[FOR_END64_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT36:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY47]] ], [ [[SUM66]], [[FOR_END64_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT36]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[CMP182_NOT58737:%.*]] = icmp ugt i32 [[PHI_SUM35]], [[SUB54]] +; CHECK-NEXT: br i1 [[CMP182_NOT58737]], label [[FOR_END64]], label [[FOR_BODY57_CLONE:%.*]] +; CHECK: for.body57.clone: +; CHECK-NEXT: [[TMP107:%.*]] = phi float [ [[TMP110:%.*]], [[FOR_BODY57_CLONE]] ], [ [[PHI_FLOAT36]], [[FOR_END16434]] ] +; CHECK-NEXT: [[K50_0129_CLONE:%.*]] = phi i32 [ [[K50_0_CLONE:%.*]], [[FOR_BODY57_CLONE]] ], [ [[PHI_SUM35]], [[FOR_END16434]] ] +; CHECK-NEXT: [[ARRAYIDX58_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0129_CLONE]] +; CHECK-NEXT: [[TMP108:%.*]] = load float, ptr [[ARRAYIDX58_CLONE]], align 4 +; CHECK-NEXT: [[SUB59_CLONE:%.*]] = sub i32 [[N41_0131]], [[K50_0129_CLONE]] +; CHECK-NEXT: [[ARRAYIDX60_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_CLONE]] +; CHECK-NEXT: [[TMP109:%.*]] = load float, ptr [[ARRAYIDX60_CLONE]], align 4 +; CHECK-NEXT: [[TMP110]] = tail call float @llvm.fmuladd.f32(float [[TMP108]], float [[TMP109]], float [[TMP107]]) +; CHECK-NEXT: [[K50_0_CLONE]] = add i32 [[K50_0129_CLONE]], 1 +; CHECK-NEXT: [[CMP56_NOT_CLONE:%.*]] = icmp ugt i32 [[K50_0_CLONE]], [[SUB54]] +; CHECK-NEXT: br i1 [[CMP56_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE38:%.*]], label [[FOR_BODY57_CLONE]] +; CHECK: for.cond.for.end_crit_edge38: +; CHECK-NEXT: store float [[TMP110]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: br label [[FOR_END64]] ; CHECK: for.end64: ; CHECK-NEXT: [[INC66]] = add nsw i32 [[N41_0131]], 1 ; CHECK-NEXT: [[EXITCOND134_NOT:%.*]] = icmp eq i32 [[INC66]], [[SUB44]] -; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[RETURN]], label [[FOR_BODY47]] +; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_BODY47]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND42_PREHEADER]] ], [ 0, [[FOR_END64]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND42_PREHEADER]] ], [ 0, [[RETURN_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll index cd8f939112a541..3091bef36bf897 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll @@ -1,9 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_corr_f32_ansi(ptr noundef readonly %Signal, i32 noundef %siglen, ptr 
noundef readonly %Pattern, i32 noundef %patlen, ptr noundef writeonly %dest) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_corr_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noundef readonly [[PATTERN:%.*]], i32 noundef [[PATLEN:%.*]], ptr noundef writeonly [[DEST:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noalias noundef readonly [[PATTERN:%.*]], i32 noundef [[PATLEN:%.*]], ptr noalias noundef writeonly [[DEST:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[PATLEN_NEG:%.*]] = sub i32 0, [[PATLEN]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[SIGNAL]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[PATTERN]], null ; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] @@ -11,39 +12,232 @@ define dso_local noundef i32 @dsps_corr_f32_ansi(ptr noundef readonly %Signal, i ; CHECK-NEXT: [[OR_COND33:%.*]] = or i1 [[OR_COND]], [[CMP4]] ; CHECK-NEXT: [[CMP7:%.*]] = icmp slt i32 [[SIGLEN]], [[PATLEN]] ; CHECK-NEXT: [[OR_COND34:%.*]] = or i1 [[CMP7]], [[OR_COND33]] -; CHECK-NEXT: br i1 [[OR_COND34]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] -; CHECK: for.cond.preheader: +; CHECK-NEXT: br i1 [[OR_COND34]], label [[RETURN:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[SIGLEN]], [[PATLEN]] -; CHECK-NEXT: [[CMP1235_NOT:%.*]] = icmp eq i32 [[PATLEN]], 0 -; CHECK-NEXT: br i1 [[CMP1235_NOT]], label [[FOR_COND11_PREHEADER_PREHEADER:%.*]], label [[FOR_COND11_PREHEADER_US:%.*]] +; CHECK-NEXT: [[SUB6:%.*]] = add nsw i32 [[SUB]], -15 +; CHECK-NEXT: [[CMP1235_NOT:%.*]] = icmp sgt i32 [[SUB]], 15 +; CHECK-NEXT: br i1 [[CMP1235_NOT]], label [[FOR_COND8_PREHEADER_LR_PH:%.*]], label [[FOR_COND91_PREHEADER:%.*]] +; CHECK: for.cond8.preheader.lr.ph: +; CHECK-NEXT: [[CMP9242:%.*]] = icmp sgt i32 [[PATLEN]], 0 +; CHECK-NEXT: [[SCEVGEP62:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 60 +; CHECK-NEXT: [[SCEVGEP66:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 56 +; CHECK-NEXT: [[SCEVGEP68:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 52 +; CHECK-NEXT: [[SCEVGEP70:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 48 +; CHECK-NEXT: [[SCEVGEP72:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 44 +; CHECK-NEXT: [[SCEVGEP74:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 40 +; CHECK-NEXT: [[SCEVGEP76:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 36 +; CHECK-NEXT: [[SCEVGEP78:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 32 +; CHECK-NEXT: [[SCEVGEP80:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 28 +; CHECK-NEXT: [[SCEVGEP82:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 24 +; CHECK-NEXT: [[SCEVGEP84:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 20 +; CHECK-NEXT: [[SCEVGEP86:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 16 +; CHECK-NEXT: [[SCEVGEP88:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 12 +; CHECK-NEXT: [[SCEVGEP90:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 8 +; CHECK-NEXT: [[SCEVGEP92:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 4 +; CHECK-NEXT: br label [[FOR_COND8_PREHEADER:%.*]] +; CHECK: for.cond8.preheader: +; CHECK-NEXT: [[LSR_IV95:%.*]] = phi ptr [ [[SCEVGEP96:%.*]], [[FOR_COND_CLEANUP:%.*]] ], [ [[SIGNAL]], [[FOR_COND8_PREHEADER_LR_PH]] ] +; CHECK-NEXT: [[N_0276:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_LR_PH]] ], [ [[ADD89:%.*]], [[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: br i1 [[CMP9242]], label [[FOR_BODY10_LR_PH:%.*]], label [[FOR_COND_CLEANUP]] +; CHECK: for.body10.lr.ph: +; 
CHECK-NEXT: br label [[FOR_BODY14_US_UNROLL:%.*]] +; CHECK: for.cond91.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND91_PREHEADER]] +; CHECK: for.cond91.preheader: +; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[ADD89]], [[FOR_COND91_PREHEADER_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[CMP92_NOT282:%.*]] = icmp sgt i32 [[N_0_LCSSA]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP92_NOT282]], label [[RETURN]], label [[FOR_COND95_PREHEADER_LR_PH:%.*]] +; CHECK: for.cond95.preheader.lr.ph: +; CHECK-NEXT: [[CMP92678:%.*]] = icmp sgt i32 [[PATLEN]], 0 +; CHECK-NEXT: br i1 [[CMP92678]], label [[FOR_COND11_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND11_PREHEADER_PREHEADER:%.*]] ; CHECK: for.cond11.preheader.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[SIGLEN]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: tail call void @llvm.memset.p0.i32(ptr nonnull align 4 [[DEST]], i8 0, i32 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST]], i32 [[TMP0]] +; CHECK-NEXT: [[N_0_LCSSA_NEG:%.*]] = sub i32 0, [[N_0_LCSSA]] +; CHECK-NEXT: [[DOTNEG:%.*]] = add i32 [[SIGLEN]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[DOTNEG]], [[PATLEN_NEG]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[N_0_LCSSA_NEG]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i32(ptr nonnull align 4 [[SCEVGEP]], i8 0, i32 [[TMP3]], i1 false) ; CHECK-NEXT: br label [[RETURN]] +; CHECK: for.cond11.preheader.us.preheader: +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SIGLEN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[PATLEN_NEG]] +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[N_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP102:%.*]] = getelementptr i8, ptr [[SIGNAL]], i32 [[TMP6]] +; CHECK-NEXT: br label [[FOR_COND11_PREHEADER_US:%.*]] ; CHECK: for.cond11.preheader.us: -; CHECK-NEXT: [[N_038_US:%.*]] = phi i32 [ [[INC18_US:%.*]], [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[SIGNAL]], i32 [[N_038_US]] +; CHECK-NEXT: [[LSR_IV103:%.*]] = phi ptr [ [[SCEVGEP104:%.*]], [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US:%.*]] ], [ [[SCEVGEP102]], [[FOR_COND11_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[N_038_US:%.*]] = phi i32 [ [[INC18_US:%.*]], [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US]] ], [ [[N_0_LCSSA]], [[FOR_COND11_PREHEADER_US_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY14_US:%.*]] ; CHECK: for.body14.us: -; CHECK-NEXT: [[M_037_US:%.*]] = phi i32 [ 0, [[FOR_COND11_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY14_US]] ] -; CHECK-NEXT: [[K_CORR_036_US:%.*]] = phi float [ 0.000000e+00, [[FOR_COND11_PREHEADER_US]] ], [ [[TMP5:%.*]], [[FOR_BODY14_US]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr float, ptr [[TMP2]], i32 [[M_037_US]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 -; CHECK-NEXT: [[ARRAYIDX15_US:%.*]] = getelementptr inbounds float, ptr [[PATTERN]], i32 [[M_037_US]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX15_US]], align 4 -; CHECK-NEXT: [[TMP5]] = tail call float @llvm.fmuladd.f32(float [[TMP3]], float [[TMP4]], float [[K_CORR_036_US]]) -; CHECK-NEXT: [[INC_US]] = add nuw i32 [[M_037_US]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_US]], [[PATLEN]] +; CHECK-NEXT: [[LSR_IV105:%.*]] = phi ptr [ [[SCEVGEP106:%.*]], [[FOR_BODY14_US]] ], [ [[LSR_IV103]], [[FOR_COND11_PREHEADER_US]] ] +; CHECK-NEXT: [[LSR_IV100:%.*]] = phi 
ptr [ [[SCEVGEP101:%.*]], [[FOR_BODY14_US]] ], [ [[PATTERN]], [[FOR_COND11_PREHEADER_US]] ] +; CHECK-NEXT: [[LSR_IV98:%.*]] = phi i32 [ [[LSR_IV_NEXT99:%.*]], [[FOR_BODY14_US]] ], [ [[PATLEN]], [[FOR_COND11_PREHEADER_US]] ] +; CHECK-NEXT: [[K_CORR_036_US:%.*]] = phi float [ 0.000000e+00, [[FOR_COND11_PREHEADER_US]] ], [ [[TMP9:%.*]], [[FOR_BODY14_US]] ] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[LSR_IV105]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[LSR_IV100]], align 4 +; CHECK-NEXT: [[TMP9]] = tail call float @llvm.fmuladd.f32(float [[TMP7]], float [[TMP8]], float [[K_CORR_036_US]]) +; CHECK-NEXT: [[LSR_IV_NEXT99]] = add i32 [[LSR_IV98]], -1 +; CHECK-NEXT: [[SCEVGEP101]] = getelementptr i8, ptr [[LSR_IV100]], i32 4 +; CHECK-NEXT: [[SCEVGEP106]] = getelementptr i8, ptr [[LSR_IV105]], i32 4 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT99]], 0 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US]], label [[FOR_BODY14_US]] ; CHECK: for.cond11.for.cond.cleanup13_crit_edge.us: ; CHECK-NEXT: [[ARRAYIDX16_US:%.*]] = getelementptr inbounds float, ptr [[DEST]], i32 [[N_038_US]] -; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX16_US]], align 4 +; CHECK-NEXT: store float [[TMP9]], ptr [[ARRAYIDX16_US]], align 4 ; CHECK-NEXT: [[INC18_US]] = add nuw i32 [[N_038_US]], 1 -; CHECK-NEXT: [[CMP10_NOT_US_NOT:%.*]] = icmp ult i32 [[N_038_US]], [[SUB]] -; CHECK-NEXT: br i1 [[CMP10_NOT_US_NOT]], label [[FOR_COND11_PREHEADER_US]], label [[RETURN]] +; CHECK-NEXT: [[SCEVGEP104]] = getelementptr i8, ptr [[LSR_IV103]], i32 4 +; CHECK-NEXT: [[CMP10_NOT_US_NOT:%.*]] = icmp eq i32 [[INC18_US]], [[TMP5]] +; CHECK-NEXT: br i1 [[CMP10_NOT_US_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_COND11_PREHEADER_US]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[TMP10:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP58:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP59:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP60:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP61:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP62:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP63:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP64:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP65:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP67:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP68:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP69:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi float [ 0.000000e+00, 
[[FOR_COND8_PREHEADER]] ], [ [[TMP70:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP23:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP71:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP24:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP72:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP73:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD89]] = add nuw nsw i32 [[N_0276]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[N_0276]], 1 +; CHECK-NEXT: [[ADD17:%.*]] = or disjoint i32 [[N_0276]], 2 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[N_0276]], 3 +; CHECK-NEXT: [[ADD21:%.*]] = or disjoint i32 [[N_0276]], 4 +; CHECK-NEXT: [[ADD23:%.*]] = or disjoint i32 [[N_0276]], 5 +; CHECK-NEXT: [[ADD25:%.*]] = or disjoint i32 [[N_0276]], 6 +; CHECK-NEXT: [[ADD27:%.*]] = or disjoint i32 [[N_0276]], 7 +; CHECK-NEXT: [[ADD29:%.*]] = or disjoint i32 [[N_0276]], 8 +; CHECK-NEXT: [[ADD31:%.*]] = or disjoint i32 [[N_0276]], 9 +; CHECK-NEXT: [[ADD33:%.*]] = or disjoint i32 [[N_0276]], 10 +; CHECK-NEXT: [[ADD35:%.*]] = or disjoint i32 [[N_0276]], 11 +; CHECK-NEXT: [[ADD37:%.*]] = or disjoint i32 [[N_0276]], 12 +; CHECK-NEXT: [[ADD39:%.*]] = or disjoint i32 [[N_0276]], 13 +; CHECK-NEXT: [[ADD41:%.*]] = or disjoint i32 [[N_0276]], 14 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[N_0276]], 15 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr float, ptr [[DEST]], i32 [[N_0276]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD17]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD21]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD25]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD27]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD29]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD31]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD35]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD37]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD39]] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD41]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD43]] +; CHECK-NEXT: store float [[TMP10]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store float [[TMP11]], ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: store float [[TMP12]], ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: store float [[TMP13]], ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: store float [[TMP14]], ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: store float [[TMP15]], ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: store float [[TMP16]], ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: store float [[TMP17]], ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: store float [[TMP18]], ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: store float [[TMP19]], ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: store float [[TMP20]], ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: store 
float [[TMP21]], ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: store float [[TMP22]], ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: store float [[TMP23]], ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: store float [[TMP24]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: store float [[TMP25]], ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[SCEVGEP96]] = getelementptr i8, ptr [[LSR_IV95]], i32 64 +; CHECK-NEXT: [[CMP745:%.*]] = icmp slt i32 [[ADD89]], [[SUB6]] +; CHECK-NEXT: br i1 [[CMP745]], label [[FOR_COND8_PREHEADER]], label [[FOR_COND91_PREHEADER_LOOPEXIT]] +; CHECK: for.body14.us.unroll: +; CHECK-NEXT: [[LSR_IV63:%.*]] = phi i32 [ 0, [[FOR_BODY10_LR_PH]] ], [ [[LSR_IV_NEXT64:%.*]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[PATLEN]], [[FOR_BODY10_LR_PH]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[K_CORR_036_US_UNROLL:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP58]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP59]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP60]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP61]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP62]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP63]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP64]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP32:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP65]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP66]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP67]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP68]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP36:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP69]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP37:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP70]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP71]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP72]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP40:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP73]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP97:%.*]] = getelementptr i8, ptr [[LSR_IV95]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i8, ptr [[SCEVGEP92]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP91:%.*]] = getelementptr i8, ptr [[SCEVGEP90]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP89:%.*]] = getelementptr i8, ptr [[SCEVGEP88]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP87:%.*]] = getelementptr i8, ptr [[SCEVGEP86]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP85:%.*]] = getelementptr i8, ptr [[SCEVGEP84]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP83:%.*]] = getelementptr i8, ptr [[SCEVGEP82]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP81:%.*]] = getelementptr i8, ptr 
[[SCEVGEP80]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP79:%.*]] = getelementptr i8, ptr [[SCEVGEP78]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP77:%.*]] = getelementptr i8, ptr [[SCEVGEP76]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP75:%.*]] = getelementptr i8, ptr [[SCEVGEP74]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP73:%.*]] = getelementptr i8, ptr [[SCEVGEP72]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP71:%.*]] = getelementptr i8, ptr [[SCEVGEP70]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP69:%.*]] = getelementptr i8, ptr [[SCEVGEP68]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP67:%.*]] = getelementptr i8, ptr [[SCEVGEP66]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP65:%.*]] = getelementptr i8, ptr [[SCEVGEP62]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[SCEVGEP94]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[SCEVGEP97]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[SCEVGEP93]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[SCEVGEP91]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCEVGEP89]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[SCEVGEP87]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCEVGEP85]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[SCEVGEP83]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCEVGEP81]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[SCEVGEP79]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[SCEVGEP77]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCEVGEP75]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCEVGEP73]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[SCEVGEP71]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[SCEVGEP69]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[SCEVGEP67]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCEVGEP65]], align 4 +; CHECK-NEXT: [[TMP58]] = tail call float @llvm.fmuladd.f32(float [[TMP42]], float [[TMP41]], float [[K_CORR_036_US_UNROLL]]) +; CHECK-NEXT: [[TMP59]] = tail call float @llvm.fmuladd.f32(float [[TMP43]], float [[TMP41]], float [[TMP26]]) +; CHECK-NEXT: [[TMP60]] = tail call float @llvm.fmuladd.f32(float [[TMP44]], float [[TMP41]], float [[TMP27]]) +; CHECK-NEXT: [[TMP61]] = tail call float @llvm.fmuladd.f32(float [[TMP45]], float [[TMP41]], float [[TMP28]]) +; CHECK-NEXT: [[TMP62]] = tail call float @llvm.fmuladd.f32(float [[TMP46]], float [[TMP41]], float [[TMP29]]) +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP47]], float [[TMP41]], float [[TMP30]]) +; CHECK-NEXT: [[TMP64]] = tail call float @llvm.fmuladd.f32(float [[TMP48]], float [[TMP41]], float [[TMP31]]) +; CHECK-NEXT: [[TMP65]] = tail call float @llvm.fmuladd.f32(float [[TMP49]], float [[TMP41]], float [[TMP32]]) +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP50]], float [[TMP41]], float [[TMP33]]) +; CHECK-NEXT: [[TMP67]] = tail call float @llvm.fmuladd.f32(float [[TMP51]], float [[TMP41]], float [[TMP34]]) +; CHECK-NEXT: [[TMP68]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP41]], float [[TMP35]]) +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP53]], float [[TMP41]], float [[TMP36]]) +; CHECK-NEXT: [[TMP70]] = tail call float @llvm.fmuladd.f32(float [[TMP54]], float [[TMP41]], float [[TMP37]]) +; CHECK-NEXT: [[TMP71]] = tail call float @llvm.fmuladd.f32(float [[TMP55]], float [[TMP41]], float [[TMP38]]) +; CHECK-NEXT: [[TMP72]] = tail call 
float @llvm.fmuladd.f32(float [[TMP56]], float [[TMP41]], float [[TMP39]]) +; CHECK-NEXT: [[TMP73]] = tail call float @llvm.fmuladd.f32(float [[TMP57]], float [[TMP41]], float [[TMP40]]) +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[LSR_IV_NEXT64]] = add nuw i32 [[LSR_IV63]], 4 +; CHECK-NEXT: [[EXITCOND_NOT_UNROLL:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_UNROLL]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY14_US_UNROLL]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND11_PREHEADER_PREHEADER]] ], [ 0, [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND11_PREHEADER_PREHEADER]] ], [ 0, [[FOR_COND91_PREHEADER]] ], [ 0, [[RETURN_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll index 2fe5f8edd108cc..af95e0500cf2c2 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll @@ -1,37 +1,126 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_dotprod_f32_ansi(ptr nocapture noundef readonly %src1, ptr nocapture noundef readonly %src2, ptr nocapture noundef writeonly %dest, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_dotprod_f32_ansi( -; CHECK-SAME: ptr nocapture noundef readonly [[SRC1:%.*]], ptr nocapture noundef readonly [[SRC2:%.*]], ptr nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[SRC1:%.*]], ptr noalias nocapture noundef readonly [[SRC2:%.*]], ptr noalias nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER1:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[LEN]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: [[CMP47110:%.*]] = icmp sgt i32 [[LEN]], 0 +; CHECK-NEXT: br i1 [[CMP47110]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[ADD44:%.*]], [[FOR_END37:%.*]] ], [ [[TMP31:%.*]], [[FOR_BODY_CLONE]] ] ; CHECK-NEXT: store float [[ACC_0_LCSSA]], ptr [[DEST]], align 4 ; CHECK-NEXT: ret i32 0 +; CHECK: for.cond.preheader1: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -7 +; CHECK-NEXT: [[CMP1113:%.*]] = icmp ugt i32 [[LEN]], 7 +; CHECK-NEXT: br i1 [[CMP1113]], 
label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND31_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LEN]], 2147483640 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond31.preheader: +; CHECK-NEXT: [[ACC0_0_LCSSA:%.*]] = phi float [ [[TMP4:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC1_0_LCSSA:%.*]] = phi float [ [[TMP7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC2_0_LCSSA:%.*]] = phi float [ [[TMP10:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC3_0_LCSSA:%.*]] = phi float [ [[TMP13:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC4_0_LCSSA:%.*]] = phi float [ [[TMP16:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC5_0_LCSSA:%.*]] = phi float [ [[TMP19:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC6_0_LCSSA:%.*]] = phi float [ [[TMP22:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC7_0_LCSSA:%.*]] = phi float [ [[TMP25:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER1]] ], [ [[TMP1]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP32132:%.*]] = icmp slt i32 [[I_0_LCSSA]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP32132]], label [[FOR_BODY33:%.*]], label [[FOR_END37]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ACC_07:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_08]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_08]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP3]] = tail call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP2]], float [[ACC_07]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END]], label [[FOR_BODY]] +; CHECK-NEXT: [[I_0122:%.*]] = phi i32 [ [[ADD30:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC_07:%.*]] = phi float [ [[TMP4]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC1:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC2:%.*]] = phi float [ [[TMP10]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC3:%.*]] = phi float [ [[TMP13]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ [[TMP16]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC5:%.*]] = phi float [ [[TMP19]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC6:%.*]] = phi float [ [[TMP22]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ [[TMP25]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_0122]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_0122]] +; CHECK-NEXT: 
[[TMP3:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP4]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[ACC_07]]) +; CHECK-NEXT: [[ADD1:%.*]] = or disjoint i32 [[I_0122]], 1 +; CHECK-NEXT: [[ARRAYIDX1_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1_1]], align 4 +; CHECK-NEXT: [[TMP7]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[ACC1]]) +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[I_0122]], 2 +; CHECK-NEXT: [[ARRAYIDX2_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[TMP10]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC2]]) +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_0122]], 3 +; CHECK-NEXT: [[ARRAYIDX3_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX3_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[TMP13]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[ACC3]]) +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_0122]], 4 +; CHECK-NEXT: [[ARRAYIDX4_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX4_1]], align 4 +; CHECK-NEXT: [[TMP16]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[ACC4]]) +; CHECK-NEXT: [[ADD5:%.*]] = or disjoint i32 [[I_0122]], 5 +; CHECK-NEXT: [[ARRAYIDX5_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX5_1]], align 4 +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP17]], float [[TMP18]], float [[ACC5]]) +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_0122]], 6 +; CHECK-NEXT: [[ARRAYIDX6_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX6_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6_1]], align 4 +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC6]]) +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_0122]], 7 +; CHECK-NEXT: [[ARRAYIDX7_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX7_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr 
[[ARRAYIDX7_1]], align 4 +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP23]], float [[TMP24]], float [[ACC7]]) +; CHECK-NEXT: [[ADD30]] = add nuw nsw i32 [[I_0122]], 8 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD30]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY]], label [[FOR_COND31_PREHEADER]] +; CHECK: for.body33: +; CHECK-NEXT: [[I_0833:%.*]] = phi i32 [ [[INC33:%.*]], [[FOR_BODY33]] ], [ [[I_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[ACC_0733:%.*]] = phi float [ [[TMP28:%.*]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_0833]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_0833]] +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX133]], align 4 +; CHECK-NEXT: [[TMP28]] = tail call float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[ACC_0733]]) +; CHECK-NEXT: [[INC33]] = add nuw nsw i32 [[I_0833]], 1 +; CHECK-NEXT: [[EXITCOND_NOT33:%.*]] = icmp eq i32 [[INC33]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT33]], label [[FOR_END37]], label [[FOR_BODY33]] +; CHECK: for.end37: +; CHECK-NEXT: [[ACC0_1_LCSSA:%.*]] = phi float [ [[TMP28]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[SUM01:%.*]] = fadd float [[ACC1_0_LCSSA]], [[ACC0_1_LCSSA]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[ACC2_0_LCSSA]], [[ACC3_0_LCSSA]] +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[ACC4_0_LCSSA]], [[ACC5_0_LCSSA]] +; CHECK-NEXT: [[SUM67:%.*]] = fadd float [[ACC6_0_LCSSA]], [[ACC7_0_LCSSA]] +; CHECK-NEXT: [[SUM0123:%.*]] = fadd float [[SUM23]], [[SUM01]] +; CHECK-NEXT: [[SUM4567:%.*]] = fadd float [[SUM45]], [[SUM67]] +; CHECK-NEXT: [[ADD44]] = fadd float [[SUM4567]], [[SUM0123]] +; CHECK-NEXT: br label [[IF_END]] ; CHECK: for.body.clone: ; CHECK-NEXT: [[I_08_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; CHECK-NEXT: [[ACC_07_CLONE:%.*]] = phi float [ [[TMP6]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] +; CHECK-NEXT: [[ACC_07_CLONE:%.*]] = phi float [ [[TMP31]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_08_CLONE]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[ARRAYIDX1_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_08_CLONE]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_CLONE]], align 4 -; CHECK-NEXT: [[TMP6]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[ACC_07_CLONE]]) +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX1_CLONE]], align 4 +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[ACC_07_CLONE]]) ; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[I_08_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[IF_END]], label [[FOR_BODY_CLONE]] diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll index 8db7f9dd4c7882..60c76b1ad159d9 100644 --- 
a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll @@ -1,28 +1,115 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local float @test_loop(ptr nocapture noundef readonly %data1, ptr nocapture noundef readonly %data2, i32 noundef %start_index, i32 noundef %end_index, i32 noundef %update1, i32 noundef %update2, float noundef %offset) local_unnamed_addr { ; CHECK-LABEL: define dso_local float @test_loop( -; CHECK-SAME: ptr nocapture noundef readonly [[DATA1:%.*]], ptr nocapture noundef readonly [[DATA2:%.*]], i32 noundef [[START_INDEX:%.*]], i32 noundef [[END_INDEX:%.*]], i32 noundef [[UPDATE1:%.*]], i32 noundef [[UPDATE2:%.*]], float noundef [[OFFSET:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[DATA1:%.*]], ptr noalias nocapture noundef readonly [[DATA2:%.*]], i32 noundef [[START_INDEX:%.*]], i32 noundef [[END_INDEX:%.*]], i32 noundef [[UPDATE1:%.*]], i32 noundef [[UPDATE2:%.*]], float noundef [[OFFSET:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[DATA1]], i32 [[UPDATE1]] ; CHECK-NEXT: [[INVARIANT_GEP8:%.*]] = getelementptr float, ptr [[DATA2]], i32 [[UPDATE2]] -; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[START_INDEX]], [[END_INDEX]] -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[START_INDEX]], [[ENTRY]] ] -; CHECK-NEXT: [[RESULT_011:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[END_INDEX]], -8 +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[SUB]], [[START_INDEX]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_COND_PREHEADER:%.*]], label [[FOR_BODY_7:%.*]] +; CHECK: for.cond.preheader: +; CHECK-NEXT: [[RESULT0_0_LCSSA:%.*]] = phi i32 [ [[START_INDEX]], [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_7:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_6:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_5:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA4:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_4:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_3:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA6:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_2:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA7:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_1:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA8:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp 
slt i32 [[RESULT0_0_LCSSA]], [[END_INDEX]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_CLONE:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.7: +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[START_INDEX]], [[ENTRY]] ], [ [[INC_7]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT6:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_6]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_5]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT4:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_4]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_3]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_2]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_1]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3]], [[FOR_BODY_7]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[I_012]] ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4 ; CHECK-NEXT: [[GEP9:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[I_012]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP9]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[OFFSET]]) -; CHECK-NEXT: [[ADD3]] = fadd float [[RESULT_011]], [[TMP2]] -; CHECK-NEXT: [[INC]] = add nsw i32 [[I_012]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[END_INDEX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: [[ADD3]] = fadd float [[RESULT0]], [[TMP2]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[I_012]], 1 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_1]], align 4 +; CHECK-NEXT: [[GEP9_1:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP9_1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP3]], float [[TMP4]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_1]] = fadd float [[RESULT1]], [[TMP5]] +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 [[I_012]], 2 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[GEP9_2:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[GEP9_2]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_2]] = fadd float [[RESULT2]], [[TMP8]] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[I_012]], 3 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[GEP_3]], align 4 +; CHECK-NEXT: [[GEP9_3:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[GEP9_3]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_3]] = fadd float [[RESULT3]], [[TMP11]] +; CHECK-NEXT: [[INC_3:%.*]] = add nsw i32 [[I_012]], 4 +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_3]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[GEP_4]], align 4 +; 
CHECK-NEXT: [[GEP9_4:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_3]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[GEP9_4]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_4]] = fadd float [[RESULT4]], [[TMP14]] +; CHECK-NEXT: [[INC_4:%.*]] = add nsw i32 [[I_012]], 5 +; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_4]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[GEP_5]], align 4 +; CHECK-NEXT: [[GEP9_5:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_4]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[GEP9_5]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_5]] = fadd float [[RESULT5]], [[TMP17]] +; CHECK-NEXT: [[INC_5:%.*]] = add nsw i32 [[I_012]], 6 +; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_5]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[GEP_6]], align 4 +; CHECK-NEXT: [[GEP9_6:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_5]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[GEP9_6]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP18]], float [[TMP19]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_6]] = fadd float [[RESULT6]], [[TMP20]] +; CHECK-NEXT: [[INC_6:%.*]] = add nsw i32 [[I_012]], 7 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_6]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[GEP_7]], align 4 +; CHECK-NEXT: [[GEP9_7:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_6]] +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[GEP9_7]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_7]] = fadd float [[ADD3_6]], [[TMP23]] +; CHECK-NEXT: [[INC_7]] = add nsw i32 [[I_012]], 8 +; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp slt i32 [[INC_7]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_COND_PREHEADER]], label [[FOR_BODY_7]] +; CHECK: for.body.clone: +; CHECK-NEXT: [[I_012_CLONE:%.*]] = phi i32 [ [[RESULT0_0_LCSSA]], [[FOR_COND_PREHEADER]] ], [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RESULT_011_CLONE:%.*]] = phi float [ [[RESULT0_0_LCSSA8]], [[FOR_COND_PREHEADER]] ], [ [[ADD3_CLONE:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[GEP_CLONE:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[I_012_CLONE]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[GEP_CLONE]], align 4 +; CHECK-NEXT: [[GEP9_CLONE:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[I_012_CLONE]] +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[GEP9_CLONE]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP24]], float [[TMP25]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_CLONE]] = fadd float [[RESULT_011_CLONE]], [[TMP26]] +; CHECK-NEXT: [[INC_CLONE]] = add nsw i32 [[I_012_CLONE]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[END_INDEX]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[FOR_END]], label [[FOR_BODY_CLONE]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[ADD3_CLONE]], [[FOR_BODY_CLONE]] ], [ [[RESULT0_0_LCSSA8]], [[FOR_COND_PREHEADER]] ] +; CHECK-NEXT: [[ADD64:%.*]] = fadd float [[RESULT0_0_LCSSA1]], [[RESULT_0_LCSSA]] +; CHECK-NEXT: [[ADD65:%.*]] = fadd float 
[[RESULT0_0_LCSSA2]], [[RESULT0_0_LCSSA3]] +; CHECK-NEXT: [[ADD66:%.*]] = fadd float [[RESULT0_0_LCSSA4]], [[RESULT0_0_LCSSA5]] +; CHECK-NEXT: [[ADD67:%.*]] = fadd float [[RESULT0_0_LCSSA6]], [[RESULT0_0_LCSSA7]] +; CHECK-NEXT: [[ADD68:%.*]] = fadd float [[ADD65]], [[ADD64]] +; CHECK-NEXT: [[ADD69:%.*]] = fadd float [[ADD66]], [[ADD67]] +; CHECK-NEXT: [[ADD70:%.*]] = fadd float [[ADD69]], [[ADD68]] +; CHECK-NEXT: ret float [[ADD70]] ; entry: %invariant.gep = getelementptr float, ptr %data1, i32 %update1 diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll index 78ea995d2a297d..9922d9aa34f774 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll @@ -1,41 +1,132 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_dotprode_f32_ansi(ptr nocapture noundef readonly %src1, ptr nocapture noundef readonly %src2, ptr nocapture noundef writeonly %dest, i32 noundef %len, i32 noundef %step1, i32 noundef %step2) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_dotprode_f32_ansi( -; CHECK-SAME: ptr nocapture noundef readonly [[SRC1:%.*]], ptr nocapture noundef readonly [[SRC2:%.*]], ptr nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[SRC1:%.*]], ptr noalias nocapture noundef readonly [[SRC2:%.*]], ptr noalias nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER1:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[LEN]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: [[CMP47110:%.*]] = icmp sgt i32 [[LEN]], 0 +; CHECK-NEXT: br i1 [[CMP47110]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[ADD44:%.*]], [[FOR_END37:%.*]] ], [ [[TMP31:%.*]], [[FOR_BODY_CLONE]] ] ; CHECK-NEXT: store float [[ACC_0_LCSSA]], ptr [[DEST]], align 4 ; CHECK-NEXT: ret i32 0 +; CHECK: for.cond.preheader1: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -7 +; CHECK-NEXT: [[CMP1113:%.*]] = icmp ugt i32 [[LEN]], 7 +; CHECK-NEXT: br i1 [[CMP1113]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND31_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LEN]], 2147483640 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond31.preheader: +; CHECK-NEXT: [[ACC0_0_LCSSA:%.*]] = phi float [ [[TMP4:%.*]], 
[[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC1_0_LCSSA:%.*]] = phi float [ [[TMP7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC2_0_LCSSA:%.*]] = phi float [ [[TMP10:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC3_0_LCSSA:%.*]] = phi float [ [[TMP13:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC4_0_LCSSA:%.*]] = phi float [ [[TMP16:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC5_0_LCSSA:%.*]] = phi float [ [[TMP19:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC6_0_LCSSA:%.*]] = phi float [ [[TMP22:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC7_0_LCSSA:%.*]] = phi float [ [[TMP25:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER1]] ], [ [[TMP1]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP32132:%.*]] = icmp slt i32 [[I_0_LCSSA]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP32132]], label [[FOR_BODY33:%.*]], label [[FOR_END37]] ; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ACC_09:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_010]], [[STEP1]] +; CHECK-NEXT: [[I_0122:%.*]] = phi i32 [ [[ADD30:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC_09:%.*]] = phi float [ [[TMP4]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC1:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC2:%.*]] = phi float [ [[TMP10]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC3:%.*]] = phi float [ [[TMP13]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ [[TMP16]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC5:%.*]] = phi float [ [[TMP19]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC6:%.*]] = phi float [ [[TMP22]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ [[TMP25]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_0122]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[I_010]], [[STEP2]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[I_0122]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[MUL1]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP3]] = tail call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP2]], float [[ACC_09]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END]], label [[FOR_BODY]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP4]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[ACC_09]]) +; CHECK-NEXT: [[ADD1:%.*]] = or disjoint i32 [[I_0122]], 1 +; 
CHECK-NEXT: [[ARRAYIDX1_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1_1]], align 4 +; CHECK-NEXT: [[TMP7]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[ACC1]]) +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[I_0122]], 2 +; CHECK-NEXT: [[ARRAYIDX2_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[TMP10]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC2]]) +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_0122]], 3 +; CHECK-NEXT: [[ARRAYIDX3_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX3_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[TMP13]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[ACC3]]) +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_0122]], 4 +; CHECK-NEXT: [[ARRAYIDX4_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX4_1]], align 4 +; CHECK-NEXT: [[TMP16]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[ACC4]]) +; CHECK-NEXT: [[ADD5:%.*]] = or disjoint i32 [[I_0122]], 5 +; CHECK-NEXT: [[ARRAYIDX5_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX5_1]], align 4 +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP17]], float [[TMP18]], float [[ACC5]]) +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_0122]], 6 +; CHECK-NEXT: [[ARRAYIDX6_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX6_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6_1]], align 4 +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC6]]) +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_0122]], 7 +; CHECK-NEXT: [[ARRAYIDX7_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX7_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX7_1]], align 4 +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP23]], float [[TMP24]], float [[ACC7]]) +; CHECK-NEXT: [[ADD30]] = add nuw nsw i32 [[I_0122]], 8 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 
[[ADD30]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY]], label [[FOR_COND31_PREHEADER]] +; CHECK: for.body33: +; CHECK-NEXT: [[I_01033:%.*]] = phi i32 [ [[INC33:%.*]], [[FOR_BODY33]] ], [ [[I_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[ACC_0933:%.*]] = phi float [ [[TMP28:%.*]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[MUL33:%.*]] = mul nsw i32 [[I_01033]], [[STEP1]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[MUL33]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[MUL133:%.*]] = mul nsw i32 [[I_01033]], [[STEP2]] +; CHECK-NEXT: [[ARRAYIDX233:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[MUL133]] +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX233]], align 4 +; CHECK-NEXT: [[TMP28]] = tail call float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[ACC_0933]]) +; CHECK-NEXT: [[INC33]] = add nuw nsw i32 [[I_01033]], 1 +; CHECK-NEXT: [[EXITCOND_NOT33:%.*]] = icmp eq i32 [[INC33]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT33]], label [[FOR_END37]], label [[FOR_BODY33]] +; CHECK: for.end37: +; CHECK-NEXT: [[ACC0_1_LCSSA:%.*]] = phi float [ [[TMP28]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[SUM01:%.*]] = fadd float [[ACC1_0_LCSSA]], [[ACC0_1_LCSSA]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[ACC2_0_LCSSA]], [[ACC3_0_LCSSA]] +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[ACC4_0_LCSSA]], [[ACC5_0_LCSSA]] +; CHECK-NEXT: [[SUM67:%.*]] = fadd float [[ACC6_0_LCSSA]], [[ACC7_0_LCSSA]] +; CHECK-NEXT: [[SUM0123:%.*]] = fadd float [[SUM23]], [[SUM01]] +; CHECK-NEXT: [[SUM4567:%.*]] = fadd float [[SUM45]], [[SUM67]] +; CHECK-NEXT: [[ADD44]] = fadd float [[SUM4567]], [[SUM0123]] +; CHECK-NEXT: br label [[IF_END]] ; CHECK: for.body.clone: ; CHECK-NEXT: [[I_010_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; CHECK-NEXT: [[ACC_09_CLONE:%.*]] = phi float [ [[TMP6]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] +; CHECK-NEXT: [[ACC_09_CLONE:%.*]] = phi float [ [[TMP31]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_010_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL1_CLONE:%.*]] = mul nsw i32 [[I_010_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX2_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[MUL1_CLONE]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2_CLONE]], align 4 -; CHECK-NEXT: [[TMP6]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[ACC_09_CLONE]]) +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX2_CLONE]], align 4 +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[ACC_09_CLONE]]) ; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[I_010_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[IF_END]], label [[FOR_BODY_CLONE]] diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll index 6a8cb4868b7ea6..61470c86fb2152 100644 --- 
a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll @@ -1,28 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s %struct.fir_f32_s = type { ptr, ptr, i32, i32, i32, i16 } define dso_local noundef i32 @dsps_fir_f32_ansi(ptr nocapture noundef %fir, ptr nocapture noundef readonly %input, ptr nocapture noundef writeonly %output, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_fir_f32_ansi( -; CHECK-SAME: ptr nocapture noundef [[FIR:%.*]], ptr nocapture noundef readonly [[INPUT:%.*]], ptr nocapture noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef [[FIR:%.*]], ptr noalias nocapture noundef readonly [[INPUT:%.*]], ptr noalias nocapture noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER:%.*]], label [[FOR_BODY_LR_PH_CLONE:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER:%.*]], label [[FOR_BODY_LR_PH_CLONE_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[CMP67:%.*]] = icmp sgt i32 [[LEN]], 0 -; CHECK-NEXT: br i1 [[CMP67]], label [[FOR_BODY_LR_PH:%.*]], label [[IF_END:%.*]] -; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[DELAY:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S:%.*]], ptr [[FIR]], i32 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DELAY]], align 4 ; CHECK-NEXT: [[POS:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 3 ; CHECK-NEXT: [[N:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[N]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP2]], -7 ; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[POS]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: if.end: -; CHECK-NEXT: ret i32 0 ; CHECK: for.body: -; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_BODY_LR_PH]] ], [ [[SPEC_STORE_SELECT:%.*]], [[FOR_COND_CLEANUP21:%.*]] ] -; CHECK-NEXT: [[I_068:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC33:%.*]], [[FOR_COND_CLEANUP21]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_COND_PREHEADER]] ], [ [[SPEC_STORE_SELECT:%.*]], [[FOR_END:%.*]] ] +; CHECK-NEXT: [[I_068:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER]] ], [ [[INC33_MODIFY:%.*]], [[FOR_END]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_068]] ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[TMP3]] @@ -31,119 +27,286 @@ define dso_local noundef i32 @dsps_fir_f32_ansi(ptr nocapture noundef %fir, ptr ; CHECK-NEXT: [[CMP4_NOT:%.*]] = icmp slt i32 [[INC]], [[TMP2]] ; CHECK-NEXT: [[SPEC_STORE_SELECT]] = select i1 [[CMP4_NOT]], i32 [[INC]], i32 0 ; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT]], ptr [[POS]], align 4 -; CHECK-NEXT: [[CMP957:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT]], [[TMP2]] -; CHECK-NEXT: br i1 [[CMP957]], label [[FOR_BODY11_LR_PH:%.*]], 
label [[FOR_COND18_PREHEADER:%.*]] +; CHECK-NEXT: [[CMP957:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP957]], label [[FOR_BODY11_LR_PH_MODIFY:%.*]], label [[FOR_COND18_PREHEADER_MODIFY:%.*]] +; CHECK: for.cond18.preheader.modify: +; CHECK-NEXT: [[N_060_MODIFY_CLONE:%.*]] = phi i32 [ [[SPEC_STORE_SELECT]], [[FOR_BODY]] ], [ [[INC16_MODIFY:%.*]], [[FOR_BODY11_MODIFY:%.*]] ] +; CHECK-NEXT: [[COEFF_POS_059_MODIFY_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC12_MODIFY:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC_058_MODIFY_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC4_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP26:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC7_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP27:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC10_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP28:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC13_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP29:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC17_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP30:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC20_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP31:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[N_060_MODIFY_CLONE]], [[TMP2]] +; CHECK-NEXT: br i1 [[CMP_SLT]], label [[FOR_BODY11_LR_PH:%.*]], label [[FOR_COND18_PREHEADER:%.*]] ; CHECK: for.body11.lr.ph: ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[FIR]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP2]], [[SPEC_STORE_SELECT]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[COEFF_POS_059_MODIFY_CLONE]] ; CHECK-NEXT: br label [[FOR_BODY11:%.*]] +; CHECK: for.body11.lr.ph.modify: +; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: br label [[FOR_BODY11_MODIFY]] +; CHECK: for.body11.modify: +; CHECK-NEXT: [[N_060_MODIFY:%.*]] = phi i32 [ [[SPEC_STORE_SELECT]], [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[INC16_MODIFY]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[COEFF_POS_059_MODIFY:%.*]] = phi i32 [ 0, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[INC12_MODIFY]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC_058_MODIFY:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP24]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP25]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP26]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP27]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC10:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP28]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC13:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP29]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP30]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC20:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP31]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[INC12_MODIFY]] = add nuw nsw i32 [[COEFF_POS_059_MODIFY]], 8 +; CHECK-NEXT: [[INC16_MODIFY]] = add nsw i32 
[[N_060_MODIFY]], 8 +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[COEFF_POS_059_MODIFY]], 7 +; CHECK-NEXT: [[ARRAYIDX13_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[COEFF_POS_059_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX15_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N_060_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 3 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 3 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 5 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 5 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 6 +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 6 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX13_MODIFY]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX15_MODIFY]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC_058_MODIFY]]) +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[ACC]]) +; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[ACC4]]) +; CHECK-NEXT: [[TMP27]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[ACC7]]) +; CHECK-NEXT: [[TMP28]] = tail call float @llvm.fmuladd.f32(float [[TMP16]], float [[TMP17]], float [[ACC10]]) +; CHECK-NEXT: [[TMP29]] = tail call float @llvm.fmuladd.f32(float [[TMP18]], float [[TMP19]], float [[ACC13]]) +; CHECK-NEXT: [[TMP30]] = tail call float 
@llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC17]]) +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP22]], float [[TMP23]], float [[ACC20]]) +; CHECK-NEXT: [[CMP11:%.*]] = icmp slt i32 [[INC16_MODIFY]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY11_MODIFY]], label [[FOR_COND18_PREHEADER_MODIFY]] +; CHECK: for.cond18.preheader.loopexit: +; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP6]], [[N_060_MODIFY_CLONE]] +; CHECK-NEXT: br label [[FOR_COND18_PREHEADER]] ; CHECK: for.cond18.preheader: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP10:%.*]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP6]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[CMP2062:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT]], 0 -; CHECK-NEXT: br i1 [[CMP2062]], label [[FOR_BODY22_LR_PH:%.*]], label [[FOR_COND_CLEANUP21]] +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ [[ACC_058_MODIFY_CLONE]], [[FOR_COND18_PREHEADER_MODIFY]] ], [ [[TMP37:%.*]], [[FOR_COND18_PREHEADER_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ [[COEFF_POS_059_MODIFY_CLONE]], [[FOR_COND18_PREHEADER_MODIFY]] ], [ [[TMP32]], [[FOR_COND18_PREHEADER_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add nsw i32 [[SPEC_STORE_SELECT]], -7 +; CHECK-NEXT: [[CMP2062:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT]], 7 +; CHECK-NEXT: br i1 [[CMP2062]], label [[FOR_BODY22_LR_PH_MODIFY:%.*]], label [[FOR_COND_CLEANUP21:%.*]] ; CHECK: for.body22.lr.ph: -; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: br label [[FOR_BODY22:%.*]] ; CHECK: for.body11: -; CHECK-NEXT: [[N_060:%.*]] = phi i32 [ [[SPEC_STORE_SELECT]], [[FOR_BODY11_LR_PH]] ], [ [[INC16:%.*]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[COEFF_POS_059:%.*]] = phi i32 [ 0, [[FOR_BODY11_LR_PH]] ], [ [[INC12:%.*]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[ACC_058:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH]] ], [ [[TMP10]], [[FOR_BODY11]] ] +; CHECK-NEXT: [[N_060:%.*]] = phi i32 [ [[N_060_MODIFY_CLONE]], [[FOR_BODY11_LR_PH]] ], [ [[INC16:%.*]], [[FOR_BODY11]] ] +; CHECK-NEXT: [[COEFF_POS_059:%.*]] = phi i32 [ [[COEFF_POS_059_MODIFY_CLONE]], [[FOR_BODY11_LR_PH]] ], [ [[INC12:%.*]], [[FOR_BODY11]] ] +; CHECK-NEXT: [[ACC_058:%.*]] = phi float [ [[ACC_058_MODIFY_CLONE]], [[FOR_BODY11_LR_PH]] ], [ [[TMP37]], [[FOR_BODY11]] ] ; CHECK-NEXT: [[INC12]] = add nuw i32 [[COEFF_POS_059]], 1 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 [[COEFF_POS_059]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N_060]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC_058]]) +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP37]] = tail call float @llvm.fmuladd.f32(float [[TMP35]], float [[TMP36]], float [[ACC_058]]) ; CHECK-NEXT: [[INC16]] = add nsw i32 [[N_060]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC12]], [[TMP6]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND18_PREHEADER]], label [[FOR_BODY11]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC16]], [[TMP2]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_COND18_PREHEADER_LOOPEXIT]], label [[FOR_BODY11]] ; CHECK: for.cond.cleanup21: -; CHECK-NEXT: [[ACC_1_LCSSA:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND18_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY22]] ] -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_068]] -; CHECK-NEXT: store float [[ACC_1_LCSSA]], ptr [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[INC33]] = add nuw nsw i32 [[I_068]], 1 -; CHECK-NEXT: [[EXITCOND71_NOT:%.*]] = icmp eq i32 [[INC33]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND71_NOT]], label [[IF_END]], label [[FOR_BODY]] +; CHECK-NEXT: [[N17_065_MODIFY_CLONE:%.*]] = phi i32 [ 0, [[FOR_COND18_PREHEADER]] ], [ [[TMP39:%.*]], [[FOR_BODY22_MODIFY:%.*]] ] +; CHECK-NEXT: [[COEFF_POS_164_MODIFY_CLONE:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_COND18_PREHEADER]] ], [ [[INC24_MODIFY:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC_163_MODIFY_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND18_PREHEADER]] ], [ [[TMP56:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC23_CLONE:%.*]] = phi float [ [[ACC_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP57:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC27_CLONE:%.*]] = phi float [ [[ACC4_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP58:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC30_CLONE:%.*]] = phi float [ [[ACC7_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP59:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC34_CLONE:%.*]] = phi float [ [[ACC10_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP60:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC37_CLONE:%.*]] = phi float [ [[ACC13_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP61:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC40_CLONE:%.*]] = phi float [ [[ACC17_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP62:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC44_CLONE:%.*]] = phi float [ [[ACC20_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP63:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[CMP47:%.*]] = icmp slt i32 [[N17_065_MODIFY_CLONE]], [[SPEC_STORE_SELECT]] +; CHECK-NEXT: br i1 [[CMP47]], label [[FOR_BODY22_LR_PH:%.*]], label [[FOR_END]] +; CHECK: for.body22.lr.ph.modify: +; CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP39]] = and i32 [[SPEC_STORE_SELECT]], 2147483640 +; CHECK-NEXT: br label [[FOR_BODY22_MODIFY]] +; CHECK: for.body22.modify: +; CHECK-NEXT: [[N17_065_MODIFY:%.*]] = phi i32 [ 0, [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[INC29_MODIFY:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[COEFF_POS_164_MODIFY:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[INC24_MODIFY]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC_163_MODIFY:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP56]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC23:%.*]] = phi float [ [[ACC_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP57]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC27:%.*]] = phi float [ [[ACC4_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP58]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC30:%.*]] = phi float [ [[ACC7_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP59]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC34:%.*]] = phi float [ [[ACC10_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP60]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC37:%.*]] = phi float [ [[ACC13_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP61]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC40:%.*]] = phi float [ [[ACC17_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP62]], 
[[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC44:%.*]] = phi float [ [[ACC20_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP63]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[INC24_MODIFY]] = add nuw nsw i32 [[COEFF_POS_164_MODIFY]], 8 +; CHECK-NEXT: [[INC29_MODIFY]] = add nuw nsw i32 [[N17_065_MODIFY]], 8 +; CHECK-NEXT: [[ADD1:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 1 +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 2 +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 3 +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 4 +; CHECK-NEXT: [[ADD5:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 5 +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 6 +; CHECK-NEXT: [[ADD743:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 7 +; CHECK-NEXT: [[ARRAYIDX25_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP38]], i32 [[COEFF_POS_164_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX27_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N17_065_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 1 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD1]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 2 +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 3 +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 4 +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 5 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD5]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 6 +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 7 +; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD743]] +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[ARRAYIDX25_MODIFY]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[ARRAYIDX27_MODIFY]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX46]], align 4 +; CHECK-NEXT: [[TMP56]] = tail call float 
@llvm.fmuladd.f32(float [[TMP40]], float [[TMP41]], float [[ACC_163_MODIFY]]) +; CHECK-NEXT: [[TMP57]] = tail call float @llvm.fmuladd.f32(float [[TMP42]], float [[TMP43]], float [[ACC23]]) +; CHECK-NEXT: [[TMP58]] = tail call float @llvm.fmuladd.f32(float [[TMP44]], float [[TMP45]], float [[ACC27]]) +; CHECK-NEXT: [[TMP59]] = tail call float @llvm.fmuladd.f32(float [[TMP46]], float [[TMP47]], float [[ACC30]]) +; CHECK-NEXT: [[TMP60]] = tail call float @llvm.fmuladd.f32(float [[TMP48]], float [[TMP49]], float [[ACC34]]) +; CHECK-NEXT: [[TMP61]] = tail call float @llvm.fmuladd.f32(float [[TMP50]], float [[TMP51]], float [[ACC37]]) +; CHECK-NEXT: [[TMP62]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP53]], float [[ACC40]]) +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP54]], float [[TMP55]], float [[ACC44]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC29_MODIFY]], [[TMP33]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY22_MODIFY]], label [[FOR_COND_CLEANUP21]] ; CHECK: for.body22: -; CHECK-NEXT: [[N17_065:%.*]] = phi i32 [ 0, [[FOR_BODY22_LR_PH]] ], [ [[INC29:%.*]], [[FOR_BODY22]] ] -; CHECK-NEXT: [[COEFF_POS_164:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_BODY22_LR_PH]] ], [ [[INC24:%.*]], [[FOR_BODY22]] ] -; CHECK-NEXT: [[ACC_163:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY22_LR_PH]] ], [ [[TMP13]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[N17_065:%.*]] = phi i32 [ [[N17_065_MODIFY_CLONE]], [[FOR_BODY22_LR_PH]] ], [ [[INC29:%.*]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[COEFF_POS_164:%.*]] = phi i32 [ [[COEFF_POS_164_MODIFY_CLONE]], [[FOR_BODY22_LR_PH]] ], [ [[INC24:%.*]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[ACC_163:%.*]] = phi float [ [[ACC_163_MODIFY_CLONE]], [[FOR_BODY22_LR_PH]] ], [ [[TMP66:%.*]], [[FOR_BODY22]] ] ; CHECK-NEXT: [[INC24]] = add nuw nsw i32 [[COEFF_POS_164]], 1 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[COEFF_POS_164]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 [[COEFF_POS_164]] +; CHECK-NEXT: [[TMP64:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N17_065]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[TMP13]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[ACC_163]]) +; CHECK-NEXT: [[TMP65:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[ACC_163]]) ; CHECK-NEXT: [[INC29]] = add nuw nsw i32 [[N17_065]], 1 ; CHECK-NEXT: [[EXITCOND70_NOT:%.*]] = icmp eq i32 [[INC29]], [[SPEC_STORE_SELECT]] -; CHECK-NEXT: br i1 [[EXITCOND70_NOT]], label [[FOR_COND_CLEANUP21]], label [[FOR_BODY22]] +; CHECK-NEXT: br i1 [[EXITCOND70_NOT]], label [[FOR_END]], label [[FOR_BODY22]] +; CHECK: for.end: +; CHECK-NEXT: [[TMP67:%.*]] = phi float [ [[ACC_163_MODIFY_CLONE]], [[FOR_COND_CLEANUP21]] ], [ [[TMP66]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[ADD139:%.*]] = fadd float [[TMP67]], [[ACC23_CLONE]] +; CHECK-NEXT: [[ADD140:%.*]] = fadd float [[ACC27_CLONE]], [[ACC30_CLONE]] +; CHECK-NEXT: [[ADD141:%.*]] = fadd float [[ACC34_CLONE]], [[ACC37_CLONE]] +; CHECK-NEXT: [[ADD142:%.*]] = fadd float [[ACC40_CLONE]], [[ACC44_CLONE]] +; CHECK-NEXT: [[ADD143:%.*]] = fadd float [[ADD139]], [[ADD140]] +; CHECK-NEXT: [[ADD144:%.*]] = fadd float [[ADD141]], [[ADD142]] +; 
CHECK-NEXT: [[ADD145:%.*]] = fadd float [[ADD143]], [[ADD144]] +; CHECK-NEXT: [[ARRAYIDX31_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_068]] +; CHECK-NEXT: store float [[ADD145]], ptr [[ARRAYIDX31_MODIFY]], align 4 +; CHECK-NEXT: [[INC33_MODIFY]] = add nuw nsw i32 [[I_068]], 1 +; CHECK-NEXT: [[EXITCOND71_NOT_MODIFY:%.*]] = icmp eq i32 [[INC33_MODIFY]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND71_NOT_MODIFY]], label [[IF_END:%.*]], label [[FOR_BODY]] +; CHECK: for.body.lr.ph.clone.preheader: +; CHECK-NEXT: [[CMP151349:%.*]] = icmp sgt i32 [[LEN]], 0 +; CHECK-NEXT: br i1 [[CMP151349]], label [[FOR_BODY_LR_PH_CLONE:%.*]], label [[IF_END]] ; CHECK: for.body.lr.ph.clone: ; CHECK-NEXT: [[DELAY_CLONE:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DELAY_CLONE]], align 4 +; CHECK-NEXT: [[TMP68:%.*]] = load ptr, ptr [[DELAY_CLONE]], align 4 ; CHECK-NEXT: [[POS_CLONE:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 3 ; CHECK-NEXT: [[N_CLONE:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_CLONE]], align 4 +; CHECK-NEXT: [[TMP69:%.*]] = load i32, ptr [[N_CLONE]], align 4 ; CHECK-NEXT: [[DOTPRE_CLONE:%.*]] = load i32, ptr [[POS_CLONE]], align 4 ; CHECK-NEXT: br label [[FOR_BODY_CLONE:%.*]] ; CHECK: for.body.clone: -; CHECK-NEXT: [[TMP16:%.*]] = phi i32 [ [[DOTPRE_CLONE]], [[FOR_BODY_LR_PH_CLONE]] ], [ [[SPEC_STORE_SELECT_CLONE:%.*]], [[FOR_COND_CLEANUP21_CLONE:%.*]] ] +; CHECK-NEXT: [[TMP70:%.*]] = phi i32 [ [[DOTPRE_CLONE]], [[FOR_BODY_LR_PH_CLONE]] ], [ [[SPEC_STORE_SELECT_CLONE:%.*]], [[FOR_COND_CLEANUP21_CLONE:%.*]] ] ; CHECK-NEXT: [[I_068_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_CLONE]] ], [ [[INC33_CLONE:%.*]], [[FOR_COND_CLEANUP21_CLONE]] ] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_068_CLONE]] -; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[ARRAYIDX1_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP16]] -; CHECK-NEXT: store float [[TMP17]], ptr [[ARRAYIDX1_CLONE]], align 4 -; CHECK-NEXT: [[INC_CLONE:%.*]] = add nsw i32 [[TMP16]], 1 -; CHECK-NEXT: [[CMP4_NOT_CLONE:%.*]] = icmp slt i32 [[INC_CLONE]], [[TMP15]] +; CHECK-NEXT: [[TMP71:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[ARRAYIDX1_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 [[TMP70]] +; CHECK-NEXT: store float [[TMP71]], ptr [[ARRAYIDX1_CLONE]], align 4 +; CHECK-NEXT: [[INC_CLONE:%.*]] = add nsw i32 [[TMP70]], 1 +; CHECK-NEXT: [[CMP4_NOT_CLONE:%.*]] = icmp slt i32 [[INC_CLONE]], [[TMP69]] ; CHECK-NEXT: [[SPEC_STORE_SELECT_CLONE]] = select i1 [[CMP4_NOT_CLONE]], i32 [[INC_CLONE]], i32 0 ; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT_CLONE]], ptr [[POS_CLONE]], align 4 -; CHECK-NEXT: [[CMP957_CLONE:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT_CLONE]], [[TMP15]] +; CHECK-NEXT: [[CMP957_CLONE:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT_CLONE]], [[TMP69]] ; CHECK-NEXT: br i1 [[CMP957_CLONE]], label [[FOR_BODY11_LR_PH_CLONE:%.*]], label [[FOR_COND18_PREHEADER_CLONE:%.*]] ; CHECK: for.body11.lr.ph.clone: -; CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[FIR]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP15]], [[SPEC_STORE_SELECT_CLONE]] +; CHECK-NEXT: [[TMP72:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = sub i32 [[TMP69]], [[SPEC_STORE_SELECT_CLONE]] ; CHECK-NEXT: br 
label [[FOR_BODY11_CLONE:%.*]] ; CHECK: for.body11.clone: ; CHECK-NEXT: [[N_060_CLONE:%.*]] = phi i32 [ [[SPEC_STORE_SELECT_CLONE]], [[FOR_BODY11_LR_PH_CLONE]] ], [ [[INC16_CLONE:%.*]], [[FOR_BODY11_CLONE]] ] ; CHECK-NEXT: [[COEFF_POS_059_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY11_LR_PH_CLONE]] ], [ [[INC12_CLONE:%.*]], [[FOR_BODY11_CLONE]] ] -; CHECK-NEXT: [[ACC_058_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_CLONE]] ], [ [[TMP22:%.*]], [[FOR_BODY11_CLONE]] ] +; CHECK-NEXT: [[ACC_058_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_CLONE]] ], [ [[TMP76:%.*]], [[FOR_BODY11_CLONE]] ] ; CHECK-NEXT: [[INC12_CLONE]] = add nuw i32 [[COEFF_POS_059_CLONE]], 1 -; CHECK-NEXT: [[ARRAYIDX13_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 [[COEFF_POS_059_CLONE]] -; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX13_CLONE]], align 4 -; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[N_060_CLONE]] -; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 -; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC_058_CLONE]]) +; CHECK-NEXT: [[ARRAYIDX13_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP72]], i32 [[COEFF_POS_059_CLONE]] +; CHECK-NEXT: [[TMP74:%.*]] = load float, ptr [[ARRAYIDX13_CLONE]], align 4 +; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 [[N_060_CLONE]] +; CHECK-NEXT: [[TMP75:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 +; CHECK-NEXT: [[TMP76]] = tail call float @llvm.fmuladd.f32(float [[TMP74]], float [[TMP75]], float [[ACC_058_CLONE]]) ; CHECK-NEXT: [[INC16_CLONE]] = add nsw i32 [[N_060_CLONE]], 1 -; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC12_CLONE]], [[TMP19]] +; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC12_CLONE]], [[TMP73]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[FOR_COND18_PREHEADER_CLONE]], label [[FOR_BODY11_CLONE]] ; CHECK: for.cond18.preheader.clone: -; CHECK-NEXT: [[ACC_0_LCSSA_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_CLONE]] ], [ [[TMP22]], [[FOR_BODY11_CLONE]] ] -; CHECK-NEXT: [[COEFF_POS_0_LCSSA_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY_CLONE]] ], [ [[TMP19]], [[FOR_BODY11_CLONE]] ] +; CHECK-NEXT: [[ACC_0_LCSSA_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_CLONE]] ], [ [[TMP76]], [[FOR_BODY11_CLONE]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY_CLONE]] ], [ [[TMP73]], [[FOR_BODY11_CLONE]] ] ; CHECK-NEXT: [[CMP2062_CLONE:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT_CLONE]], 0 ; CHECK-NEXT: br i1 [[CMP2062_CLONE]], label [[FOR_BODY22_LR_PH_CLONE:%.*]], label [[FOR_COND_CLEANUP21_CLONE]] ; CHECK: for.body22.lr.ph.clone: -; CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP77:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: br label [[FOR_BODY22_CLONE:%.*]] ; CHECK: for.body22.clone: ; CHECK-NEXT: [[N17_065_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY22_LR_PH_CLONE]] ], [ [[INC29_CLONE:%.*]], [[FOR_BODY22_CLONE]] ] ; CHECK-NEXT: [[COEFF_POS_164_CLONE:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA_CLONE]], [[FOR_BODY22_LR_PH_CLONE]] ], [ [[INC24_CLONE:%.*]], [[FOR_BODY22_CLONE]] ] -; CHECK-NEXT: [[ACC_163_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_BODY22_LR_PH_CLONE]] ], [ [[TMP26:%.*]], [[FOR_BODY22_CLONE]] ] +; CHECK-NEXT: [[ACC_163_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_BODY22_LR_PH_CLONE]] ], [ [[TMP80:%.*]], [[FOR_BODY22_CLONE]] ] ; 
CHECK-NEXT: [[INC24_CLONE]] = add nuw nsw i32 [[COEFF_POS_164_CLONE]], 1 -; CHECK-NEXT: [[ARRAYIDX25_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[COEFF_POS_164_CLONE]] -; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX25_CLONE]], align 4 -; CHECK-NEXT: [[ARRAYIDX27_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[N17_065_CLONE]] -; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX27_CLONE]], align 4 -; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP24]], float [[TMP25]], float [[ACC_163_CLONE]]) +; CHECK-NEXT: [[ARRAYIDX25_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 [[COEFF_POS_164_CLONE]] +; CHECK-NEXT: [[TMP78:%.*]] = load float, ptr [[ARRAYIDX25_CLONE]], align 4 +; CHECK-NEXT: [[ARRAYIDX27_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 [[N17_065_CLONE]] +; CHECK-NEXT: [[TMP79:%.*]] = load float, ptr [[ARRAYIDX27_CLONE]], align 4 +; CHECK-NEXT: [[TMP80]] = tail call float @llvm.fmuladd.f32(float [[TMP78]], float [[TMP79]], float [[ACC_163_CLONE]]) ; CHECK-NEXT: [[INC29_CLONE]] = add nuw nsw i32 [[N17_065_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND70_NOT_CLONE:%.*]] = icmp eq i32 [[INC29_CLONE]], [[SPEC_STORE_SELECT_CLONE]] ; CHECK-NEXT: br i1 [[EXITCOND70_NOT_CLONE]], label [[FOR_COND_CLEANUP21_CLONE]], label [[FOR_BODY22_CLONE]] ; CHECK: for.cond.cleanup21.clone: -; CHECK-NEXT: [[ACC_1_LCSSA_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_COND18_PREHEADER_CLONE]] ], [ [[TMP26]], [[FOR_BODY22_CLONE]] ] +; CHECK-NEXT: [[ACC_1_LCSSA_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_COND18_PREHEADER_CLONE]] ], [ [[TMP80]], [[FOR_BODY22_CLONE]] ] ; CHECK-NEXT: [[ARRAYIDX31_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_068_CLONE]] ; CHECK-NEXT: store float [[ACC_1_LCSSA_CLONE]], ptr [[ARRAYIDX31_CLONE]], align 4 ; CHECK-NEXT: [[INC33_CLONE]] = add nuw nsw i32 [[I_068_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND71_NOT_CLONE:%.*]] = icmp eq i32 [[INC33_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND71_NOT_CLONE]], label [[IF_END]], label [[FOR_BODY_CLONE]] +; CHECK: if.end: +; CHECK-NEXT: ret i32 0 ; entry: %0 = icmp sgt i32 %len, 2 diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll index 875710cf61b86c..e7a15e8558512f 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s %struct.fir_f32_s = type { ptr, ptr, i32, i32, i32, i16 } define dso_local noundef i32 @dsps_fird_f32_ansi(ptr nocapture noundef %fir, ptr nocapture noundef readonly %input, ptr nocapture noundef writeonly %output, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_fird_f32_ansi( -; CHECK-SAME: ptr nocapture noundef [[FIR:%.*]], ptr nocapture noundef readonly [[INPUT:%.*]], ptr nocapture noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef [[FIR:%.*]], ptr noalias nocapture noundef readonly [[INPUT:%.*]], ptr noalias nocapture noundef 
writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP77:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP77]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -18,32 +18,57 @@ define dso_local noundef i32 @dsps_fird_f32_ansi(ptr nocapture noundef %fir, ptr ; CHECK-NEXT: [[POS9_PROMOTED:%.*]] = load i32, ptr [[POS]], align 4 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; CHECK: for.cond1.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[POS9_PROMOTED]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[TMP4:%.*]], [[FOR_COND_CLEANUP26:%.*]] ] -; CHECK-NEXT: [[I_080:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC39:%.*]], [[FOR_COND_CLEANUP26]] ] -; CHECK-NEXT: [[INPUT_ADDR_078:%.*]] = phi ptr [ [[INPUT]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INPUT_ADDR_1_LCSSA:%.*]], [[FOR_COND_CLEANUP26]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[POS9_PROMOTED]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[TMP4:%.*]], [[FOR_END141:%.*]] ] +; CHECK-NEXT: [[I_080:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC152:%.*]], [[FOR_END141]] ] +; CHECK-NEXT: [[INPUT_ADDR_078:%.*]] = phi ptr [ [[INPUT]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INPUT_ADDR_1_LCSSA:%.*]], [[FOR_END141]] ] ; CHECK-NEXT: br i1 [[CMP263]], label [[FOR_BODY4_LR_PH:%.*]], label [[FOR_COND_CLEANUP3:%.*]] ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DELAY]], align 4 ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[LEN]], [[FOR_COND_CLEANUP26]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[LEN]], [[FOR_END141]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge: -; CHECK-NEXT: store i32 [[SPEC_SELECT:%.*]], ptr [[POS]], align 4 +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: store i32 [[SPEC_SELECT_LCSSA]], ptr [[POS]], align 4 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] ; CHECK: for.cond.cleanup3: -; CHECK-NEXT: [[TMP4]] = phi i32 [ [[SPEC_SELECT]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE:%.*]] ], [ [[TMP2]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[INPUT_ADDR_1_LCSSA]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE]] ], [ [[INPUT_ADDR_078]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[CMP1266:%.*]] = icmp slt i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP1266]], label [[FOR_BODY14_LR_PH:%.*]], label [[FOR_COND23_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[SPEC_SELECT_LCSSA]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE:%.*]] ], [ [[TMP2]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[INPUT_ADDR_1_LCSSA]] = phi ptr [ [[INCDEC_PTR_LCSSA]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE]] ], [ [[INPUT_ADDR_078]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[ADD269:%.*]] = add nsw i32 [[TMP4]], 8 +; CHECK-NEXT: [[CMP1266:%.*]] = icmp sgt i32 [[ADD269]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP1266]], label [[FOR_COND63_PREHEADER:%.*]], label [[FOR_BODY14_LR_PH:%.*]] ; CHECK: for.body14.lr.ph: ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DELAY]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP1]], [[TMP4]] -; CHECK-NEXT: br label [[FOR_BODY14:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP6]], i32 28 +; 
CHECK-NEXT: [[TMP7:%.*]] = shl i32 [[TMP4]], 2 +; CHECK-NEXT: [[SCEVGEP101:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP105:%.*]] = getelementptr i8, ptr [[TMP6]], i32 24 +; CHECK-NEXT: [[SCEVGEP106:%.*]] = getelementptr i8, ptr [[SCEVGEP105]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP108:%.*]] = getelementptr i8, ptr [[TMP6]], i32 20 +; CHECK-NEXT: [[SCEVGEP109:%.*]] = getelementptr i8, ptr [[SCEVGEP108]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP111:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[SCEVGEP112:%.*]] = getelementptr i8, ptr [[SCEVGEP111]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP114:%.*]] = getelementptr i8, ptr [[TMP6]], i32 12 +; CHECK-NEXT: [[SCEVGEP115:%.*]] = getelementptr i8, ptr [[SCEVGEP114]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP117:%.*]] = getelementptr i8, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[SCEVGEP118:%.*]] = getelementptr i8, ptr [[SCEVGEP117]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP120:%.*]] = getelementptr i8, ptr [[TMP6]], i32 4 +; CHECK-NEXT: [[SCEVGEP121:%.*]] = getelementptr i8, ptr [[SCEVGEP120]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP123:%.*]] = getelementptr i8, ptr [[TMP6]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP127:%.*]] = getelementptr i8, ptr [[TMP5]], i32 28 +; CHECK-NEXT: [[SCEVGEP129:%.*]] = getelementptr i8, ptr [[TMP5]], i32 24 +; CHECK-NEXT: [[SCEVGEP131:%.*]] = getelementptr i8, ptr [[TMP5]], i32 20 +; CHECK-NEXT: [[SCEVGEP133:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16 +; CHECK-NEXT: [[SCEVGEP135:%.*]] = getelementptr i8, ptr [[TMP5]], i32 12 +; CHECK-NEXT: [[SCEVGEP137:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; CHECK-NEXT: [[SCEVGEP139:%.*]] = getelementptr i8, ptr [[TMP5]], i32 4 +; CHECK-NEXT: br label [[FOR_BODY14_7:%.*]] ; CHECK: for.body4: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY4]] ], [ [[TMP0]], [[FOR_BODY4_LR_PH]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY4_LR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY4]] ] -; CHECK-NEXT: [[K_065:%.*]] = phi i32 [ 0, [[FOR_BODY4_LR_PH]] ], [ [[INC8:%.*]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[INPUT_ADDR_164:%.*]] = phi ptr [ [[INPUT_ADDR_078]], [[FOR_BODY4_LR_PH]] ], [ [[INCDEC_PTR]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[INPUT_ADDR_164]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[INPUT_ADDR_164]], align 4 @@ -52,51 +77,247 @@ define dso_local noundef i32 @dsps_fird_f32_ansi(ptr nocapture noundef %fir, ptr ; CHECK-NEXT: store float [[TMP9]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp slt i32 [[INC]], [[TMP1]] ; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP6_NOT]], i32 [[INC]], i32 0 -; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[K_065]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC8]], [[TMP0]] +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE]], label [[FOR_BODY4]] -; CHECK: for.cond23.preheader: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP14:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP3]] ], [ [[TMP7]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[CMP2572:%.*]] = icmp sgt i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[CMP2572]], label [[FOR_BODY27_LR_PH:%.*]], label [[FOR_COND_CLEANUP26]] +; CHECK: for.cond63.preheader: +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, 
[[FOR_COND_CLEANUP3]] ], [ [[TMP18:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_1_LCSSA2:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP21:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_2_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP24:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_3_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP27:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_4_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP30:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_5_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP33:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_6_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP36:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_7_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP39:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP3]] ], [ [[LSR_IV_NEXT126:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[FOR_COND_CLEANUP3]] ], [ [[LSR_IV_NEXT100:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[CMP2572:%.*]] = icmp slt i32 [[N_0_LCSSA]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP2572]], label [[FOR_BODY27_LR_PH:%.*]], label [[FOR_COND_CLEANUP26:%.*]] ; CHECK: for.body27.lr.ph: ; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DELAY]], align 4 -; CHECK-NEXT: br label [[FOR_BODY27:%.*]] -; CHECK: for.body14: -; CHECK-NEXT: [[N_069:%.*]] = phi i32 [ [[TMP4]], [[FOR_BODY14_LR_PH]] ], [ [[INC20:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[COEFF_POS_068:%.*]] = phi i32 [ 0, [[FOR_BODY14_LR_PH]] ], [ [[INC15:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[ACC_067:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP14]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[INC15]] = add nuw i32 [[COEFF_POS_068]], 1 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 [[COEFF_POS_068]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 [[N_069]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 -; CHECK-NEXT: [[TMP14]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[ACC_067]]) -; CHECK-NEXT: [[INC20]] = add nsw i32 [[N_069]], 1 -; CHECK-NEXT: [[EXITCOND83_NOT:%.*]] = icmp eq i32 [[INC15]], [[TMP7]] -; CHECK-NEXT: br i1 [[EXITCOND83_NOT]], label [[FOR_COND23_PREHEADER]], label [[FOR_BODY14]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[COEFF_POS_0_LCSSA]], [[TMP1]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i32 [[TMP1]], [[N_0_LCSSA]] +; CHECK-NEXT: [[TMP14:%.*]] = shl i32 [[N_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP144:%.*]] = getelementptr i8, ptr [[TMP11]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = shl i32 [[COEFF_POS_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP147:%.*]] = getelementptr i8, ptr [[TMP10]], i32 [[TMP15]] +; CHECK-NEXT: br label [[FOR_BODY14_CLONE:%.*]] +; CHECK: for.body14.7: +; CHECK-NEXT: [[LSR_IV125:%.*]] = phi i32 [ 0, [[FOR_BODY14_LR_PH]] ], [ [[LSR_IV_NEXT126]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[LSR_IV102:%.*]] = phi i32 [ 0, [[FOR_BODY14_LR_PH]] ], [ [[LSR_IV_NEXT103:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[LSR_IV99:%.*]] = phi i32 [ [[TMP4]], [[FOR_BODY14_LR_PH]] ], [ [[LSR_IV_NEXT100]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ 
[[TMP18]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC3:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP21]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP24]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC5:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP27]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP30]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP33]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC8:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP36]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP39]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[SCEVGEP141:%.*]] = getelementptr i8, ptr [[TMP5]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[SCEVGEP141]], align 4 +; CHECK-NEXT: [[SCEVGEP124:%.*]] = getelementptr i8, ptr [[SCEVGEP123]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCEVGEP124]], align 4 +; CHECK-NEXT: [[TMP18]] = tail call float @llvm.fmuladd.f32(float [[TMP16]], float [[TMP17]], float [[ACC]]) +; CHECK-NEXT: [[SCEVGEP140:%.*]] = getelementptr i8, ptr [[SCEVGEP139]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[SCEVGEP140]], align 4 +; CHECK-NEXT: [[SCEVGEP122:%.*]] = getelementptr i8, ptr [[SCEVGEP121]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[SCEVGEP122]], align 4 +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP19]], float [[TMP20]], float [[ACC3]]) +; CHECK-NEXT: [[SCEVGEP138:%.*]] = getelementptr i8, ptr [[SCEVGEP137]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[SCEVGEP138]], align 4 +; CHECK-NEXT: [[SCEVGEP119:%.*]] = getelementptr i8, ptr [[SCEVGEP118]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[SCEVGEP119]], align 4 +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP22]], float [[TMP23]], float [[ACC4]]) +; CHECK-NEXT: [[SCEVGEP136:%.*]] = getelementptr i8, ptr [[SCEVGEP135]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCEVGEP136]], align 4 +; CHECK-NEXT: [[SCEVGEP116:%.*]] = getelementptr i8, ptr [[SCEVGEP115]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[SCEVGEP116]], align 4 +; CHECK-NEXT: [[TMP27]] = tail call float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[ACC5]]) +; CHECK-NEXT: [[SCEVGEP134:%.*]] = getelementptr i8, ptr [[SCEVGEP133]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[SCEVGEP134]], align 4 +; CHECK-NEXT: [[SCEVGEP113:%.*]] = getelementptr i8, ptr [[SCEVGEP112]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[SCEVGEP113]], align 4 +; CHECK-NEXT: [[TMP30]] = tail call float @llvm.fmuladd.f32(float [[TMP28]], float [[TMP29]], float [[ACC6]]) +; CHECK-NEXT: [[SCEVGEP132:%.*]] = getelementptr i8, ptr [[SCEVGEP131]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[SCEVGEP132]], align 4 +; CHECK-NEXT: [[SCEVGEP110:%.*]] = getelementptr i8, ptr [[SCEVGEP109]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[SCEVGEP110]], align 4 +; CHECK-NEXT: [[TMP33]] = tail call float @llvm.fmuladd.f32(float [[TMP31]], float [[TMP32]], float [[ACC7]]) +; CHECK-NEXT: [[SCEVGEP130:%.*]] = getelementptr i8, ptr [[SCEVGEP129]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP34:%.*]] = 
load float, ptr [[SCEVGEP130]], align 4 +; CHECK-NEXT: [[SCEVGEP107:%.*]] = getelementptr i8, ptr [[SCEVGEP106]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[SCEVGEP107]], align 4 +; CHECK-NEXT: [[TMP36]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[ACC8]]) +; CHECK-NEXT: [[SCEVGEP128:%.*]] = getelementptr i8, ptr [[SCEVGEP127]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCEVGEP128]], align 4 +; CHECK-NEXT: [[SCEVGEP104:%.*]] = getelementptr i8, ptr [[SCEVGEP101]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[SCEVGEP104]], align 4 +; CHECK-NEXT: [[TMP39]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[ACC9]]) +; CHECK-NEXT: [[LSR_IV_NEXT100]] = add nsw i32 [[LSR_IV99]], 8 +; CHECK-NEXT: [[TMP40:%.*]] = add i32 [[LSR_IV_NEXT100]], 8 +; CHECK-NEXT: [[LSR_IV_NEXT103]] = add nuw i32 [[LSR_IV102]], 32 +; CHECK-NEXT: [[LSR_IV_NEXT126]] = add nuw i32 [[LSR_IV125]], 8 +; CHECK-NEXT: [[EXITCOND83_NOT_7:%.*]] = icmp sgt i32 [[TMP40]], [[TMP1]] +; CHECK-NEXT: br i1 [[EXITCOND83_NOT_7]], label [[FOR_COND63_PREHEADER]], label [[FOR_BODY14_7]] +; CHECK: for.body79.lr.ph: +; CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DELAY]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP4]], 2147483640 +; CHECK-NEXT: [[SCEVGEP150:%.*]] = getelementptr i8, ptr [[TMP42]], i32 28 +; CHECK-NEXT: [[SCEVGEP154:%.*]] = getelementptr i8, ptr [[TMP42]], i32 24 +; CHECK-NEXT: [[SCEVGEP156:%.*]] = getelementptr i8, ptr [[TMP42]], i32 20 +; CHECK-NEXT: [[SCEVGEP158:%.*]] = getelementptr i8, ptr [[TMP42]], i32 16 +; CHECK-NEXT: [[SCEVGEP160:%.*]] = getelementptr i8, ptr [[TMP42]], i32 12 +; CHECK-NEXT: [[SCEVGEP162:%.*]] = getelementptr i8, ptr [[TMP42]], i32 8 +; CHECK-NEXT: [[SCEVGEP164:%.*]] = getelementptr i8, ptr [[TMP42]], i32 4 +; CHECK-NEXT: [[SCEVGEP169:%.*]] = getelementptr i8, ptr [[TMP41]], i32 28 +; CHECK-NEXT: [[TMP44:%.*]] = shl i32 [[COEFF_POS_1_LCSSA:%.*]], 2 +; CHECK-NEXT: [[SCEVGEP170:%.*]] = getelementptr i8, ptr [[SCEVGEP169]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP172:%.*]] = getelementptr i8, ptr [[TMP41]], i32 24 +; CHECK-NEXT: [[SCEVGEP173:%.*]] = getelementptr i8, ptr [[SCEVGEP172]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP175:%.*]] = getelementptr i8, ptr [[TMP41]], i32 20 +; CHECK-NEXT: [[SCEVGEP176:%.*]] = getelementptr i8, ptr [[SCEVGEP175]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP178:%.*]] = getelementptr i8, ptr [[TMP41]], i32 16 +; CHECK-NEXT: [[SCEVGEP179:%.*]] = getelementptr i8, ptr [[SCEVGEP178]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP181:%.*]] = getelementptr i8, ptr [[TMP41]], i32 12 +; CHECK-NEXT: [[SCEVGEP182:%.*]] = getelementptr i8, ptr [[SCEVGEP181]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP184:%.*]] = getelementptr i8, ptr [[TMP41]], i32 8 +; CHECK-NEXT: [[SCEVGEP185:%.*]] = getelementptr i8, ptr [[SCEVGEP184]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP187:%.*]] = getelementptr i8, ptr [[TMP41]], i32 4 +; CHECK-NEXT: [[SCEVGEP188:%.*]] = getelementptr i8, ptr [[SCEVGEP187]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP190:%.*]] = getelementptr i8, ptr [[TMP41]], i32 [[TMP44]] +; CHECK-NEXT: br label [[FOR_BODY27_7:%.*]] +; CHECK: for.body14.clone: +; CHECK-NEXT: [[LSR_IV148:%.*]] = phi ptr [ [[SCEVGEP149:%.*]], [[FOR_BODY14_CLONE]] ], [ [[SCEVGEP147]], [[FOR_BODY27_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV145:%.*]] = phi ptr [ [[SCEVGEP146:%.*]], [[FOR_BODY14_CLONE]] ], [ [[SCEVGEP144]], [[FOR_BODY27_LR_PH]] ] 
+; CHECK-NEXT: [[LSR_IV142:%.*]] = phi i32 [ [[LSR_IV_NEXT143:%.*]], [[FOR_BODY14_CLONE]] ], [ [[TMP13]], [[FOR_BODY27_LR_PH]] ] +; CHECK-NEXT: [[ACC_067_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY27_LR_PH]] ], [ [[TMP47:%.*]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[LSR_IV148]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[LSR_IV145]], align 4 +; CHECK-NEXT: [[TMP47]] = tail call float @llvm.fmuladd.f32(float [[TMP45]], float [[TMP46]], float [[ACC_067_CLONE]]) +; CHECK-NEXT: [[LSR_IV_NEXT143]] = add i32 [[LSR_IV142]], -1 +; CHECK-NEXT: [[SCEVGEP146]] = getelementptr i8, ptr [[LSR_IV145]], i32 4 +; CHECK-NEXT: [[SCEVGEP149]] = getelementptr i8, ptr [[LSR_IV148]], i32 4 +; CHECK-NEXT: [[EXITCOND83_NOT_CLONE:%.*]] = icmp eq i32 [[LSR_IV_NEXT143]], 0 +; CHECK-NEXT: br i1 [[EXITCOND83_NOT_CLONE]], label [[FOR_COND_CLEANUP26_LOOPEXIT:%.*]], label [[FOR_BODY14_CLONE]] +; CHECK: for.cond130.preheader: +; CHECK-NEXT: [[ACC_0_LCSSA_CLONE:%.*]] = phi float [ [[ACC_1_LCSSA:%.*]], [[FOR_COND_CLEANUP26]] ], [ [[TMP51:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_1_LCSSA2_CLONE:%.*]] = phi float [ [[ACC_1_LCSSA2]], [[FOR_COND_CLEANUP26]] ], [ [[TMP54:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_2_LCSSA_CLONE:%.*]] = phi float [ [[ACC_2_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP57:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_3_LCSSA_CLONE:%.*]] = phi float [ [[ACC_3_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP60:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_4_LCSSA_CLONE:%.*]] = phi float [ [[ACC_4_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP63:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_5_LCSSA_CLONE:%.*]] = phi float [ [[ACC_5_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP66:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_6_LCSSA_CLONE:%.*]] = phi float [ [[ACC_6_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP69:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_7_LCSSA_CLONE:%.*]] = phi float [ [[ACC_7_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP72:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA_CLONE:%.*]] = phi i32 [ [[COEFF_POS_1_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[LSR_IV_NEXT168:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[N_0_LCSSA_CLONE:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP26]] ], [ [[TMP43]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[CMP2572_CLONE:%.*]] = icmp slt i32 [[N_0_LCSSA_CLONE]], [[TMP4]] +; CHECK-NEXT: br i1 [[CMP2572_CLONE]], label [[FOR_BODY133_LR_PH:%.*]], label [[FOR_END141]] +; CHECK: for.cond.cleanup26.loopexit: +; CHECK-NEXT: [[DOTLCSSA207:%.*]] = phi float [ [[TMP47]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[N_0_LCSSA_NEG:%.*]] = sub i32 0, [[N_0_LCSSA]] +; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP12]], [[N_0_LCSSA_NEG]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP26]] ; CHECK: for.cond.cleanup26: -; CHECK-NEXT: [[ACC_1_LCSSA:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND23_PREHEADER]] ], [ [[TMP17:%.*]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[INC39]] = add nuw nsw i32 [[I_080]], 1 +; CHECK-NEXT: [[COEFF_POS_1_LCSSA]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_COND63_PREHEADER]] ], [ [[TMP48]], [[FOR_COND_CLEANUP26_LOOPEXIT]] ] +; CHECK-NEXT: [[ACC_1_LCSSA]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND63_PREHEADER]] ], [ [[DOTLCSSA207]], [[FOR_COND_CLEANUP26_LOOPEXIT]] ] +; CHECK-NEXT: [[EXITCOND85_NOT:%.*]] = icmp slt i32 [[TMP4]], 8 +; CHECK-NEXT: br i1 [[EXITCOND85_NOT]], label [[FOR_COND130_PREHEADER:%.*]], label [[FOR_BODY79_LR_PH:%.*]] +; CHECK: for.body27.7: +; CHECK-NEXT: [[LSR_IV167:%.*]] = phi i32 [ [[COEFF_POS_1_LCSSA]], 
[[FOR_BODY79_LR_PH]] ], [ [[LSR_IV_NEXT168]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[LSR_IV151:%.*]] = phi i32 [ 0, [[FOR_BODY79_LR_PH]] ], [ [[LSR_IV_NEXT152:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ADD76310:%.*]] = phi i32 [ 8, [[FOR_BODY79_LR_PH]] ], [ [[ADD76:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC38:%.*]] = phi float [ [[ACC_1_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP51]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC39:%.*]] = phi float [ [[ACC_1_LCSSA2]], [[FOR_BODY79_LR_PH]] ], [ [[TMP54]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC40:%.*]] = phi float [ [[ACC_2_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP57]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC41:%.*]] = phi float [ [[ACC_3_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP60]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC42:%.*]] = phi float [ [[ACC_4_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP63]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC43:%.*]] = phi float [ [[ACC_5_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP66]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC44:%.*]] = phi float [ [[ACC_6_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP69]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC45:%.*]] = phi float [ [[ACC_7_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP72]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[SCEVGEP191:%.*]] = getelementptr i8, ptr [[SCEVGEP190]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCEVGEP191]], align 4 +; CHECK-NEXT: [[SCEVGEP166:%.*]] = getelementptr i8, ptr [[TMP42]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[SCEVGEP166]], align 4 +; CHECK-NEXT: [[TMP51]] = tail call float @llvm.fmuladd.f32(float [[TMP49]], float [[TMP50]], float [[ACC38]]) +; CHECK-NEXT: [[SCEVGEP189:%.*]] = getelementptr i8, ptr [[SCEVGEP188]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCEVGEP189]], align 4 +; CHECK-NEXT: [[SCEVGEP165:%.*]] = getelementptr i8, ptr [[SCEVGEP164]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCEVGEP165]], align 4 +; CHECK-NEXT: [[TMP54]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP53]], float [[ACC39]]) +; CHECK-NEXT: [[SCEVGEP186:%.*]] = getelementptr i8, ptr [[SCEVGEP185]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[SCEVGEP186]], align 4 +; CHECK-NEXT: [[SCEVGEP163:%.*]] = getelementptr i8, ptr [[SCEVGEP162]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[SCEVGEP163]], align 4 +; CHECK-NEXT: [[TMP57]] = tail call float @llvm.fmuladd.f32(float [[TMP55]], float [[TMP56]], float [[ACC40]]) +; CHECK-NEXT: [[SCEVGEP183:%.*]] = getelementptr i8, ptr [[SCEVGEP182]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[SCEVGEP183]], align 4 +; CHECK-NEXT: [[SCEVGEP161:%.*]] = getelementptr i8, ptr [[SCEVGEP160]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[SCEVGEP161]], align 4 +; CHECK-NEXT: [[TMP60]] = tail call float @llvm.fmuladd.f32(float [[TMP58]], float [[TMP59]], float [[ACC41]]) +; CHECK-NEXT: [[SCEVGEP180:%.*]] = getelementptr i8, ptr [[SCEVGEP179]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[SCEVGEP180]], align 4 +; CHECK-NEXT: [[SCEVGEP159:%.*]] = getelementptr i8, ptr [[SCEVGEP158]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCEVGEP159]], align 4 +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP61]], float [[TMP62]], float [[ACC42]]) +; CHECK-NEXT: [[SCEVGEP177:%.*]] = getelementptr i8, ptr [[SCEVGEP176]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP64:%.*]] = load float, ptr 
[[SCEVGEP177]], align 4 +; CHECK-NEXT: [[SCEVGEP157:%.*]] = getelementptr i8, ptr [[SCEVGEP156]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP65:%.*]] = load float, ptr [[SCEVGEP157]], align 4 +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[ACC43]]) +; CHECK-NEXT: [[SCEVGEP174:%.*]] = getelementptr i8, ptr [[SCEVGEP173]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCEVGEP174]], align 4 +; CHECK-NEXT: [[SCEVGEP155:%.*]] = getelementptr i8, ptr [[SCEVGEP154]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP68:%.*]] = load float, ptr [[SCEVGEP155]], align 4 +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[ACC44]]) +; CHECK-NEXT: [[SCEVGEP171:%.*]] = getelementptr i8, ptr [[SCEVGEP170]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP70:%.*]] = load float, ptr [[SCEVGEP171]], align 4 +; CHECK-NEXT: [[SCEVGEP153:%.*]] = getelementptr i8, ptr [[SCEVGEP150]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP71:%.*]] = load float, ptr [[SCEVGEP153]], align 4 +; CHECK-NEXT: [[TMP72]] = tail call float @llvm.fmuladd.f32(float [[TMP70]], float [[TMP71]], float [[ACC45]]) +; CHECK-NEXT: [[ADD76]] = add nuw nsw i32 [[ADD76310]], 8 +; CHECK-NEXT: [[LSR_IV_NEXT152]] = add nuw i32 [[LSR_IV151]], 32 +; CHECK-NEXT: [[LSR_IV_NEXT168]] = add i32 [[LSR_IV167]], 8 +; CHECK-NEXT: [[EXITCOND84_NOT_7:%.*]] = icmp sgt i32 [[ADD76]], [[TMP4]] +; CHECK-NEXT: br i1 [[EXITCOND84_NOT_7]], label [[FOR_COND130_PREHEADER]], label [[FOR_BODY27_7]] +; CHECK: for.body133.lr.ph: +; CHECK-NEXT: [[TMP73:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = load ptr, ptr [[DELAY]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP4]], [[N_0_LCSSA_CLONE]] +; CHECK-NEXT: [[TMP76:%.*]] = shl i32 [[N_0_LCSSA_CLONE]], 2 +; CHECK-NEXT: [[SCEVGEP194:%.*]] = getelementptr i8, ptr [[TMP74]], i32 [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = shl i32 [[COEFF_POS_0_LCSSA_CLONE]], 2 +; CHECK-NEXT: [[SCEVGEP197:%.*]] = getelementptr i8, ptr [[TMP73]], i32 [[TMP77]] +; CHECK-NEXT: br label [[FOR_BODY27_CLONE:%.*]] +; CHECK: for.body27.clone: +; CHECK-NEXT: [[LSR_IV198:%.*]] = phi ptr [ [[SCEVGEP199:%.*]], [[FOR_BODY27_CLONE]] ], [ [[SCEVGEP197]], [[FOR_BODY133_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV195:%.*]] = phi ptr [ [[SCEVGEP196:%.*]], [[FOR_BODY27_CLONE]] ], [ [[SCEVGEP194]], [[FOR_BODY133_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV192:%.*]] = phi i32 [ [[LSR_IV_NEXT193:%.*]], [[FOR_BODY27_CLONE]] ], [ [[TMP75]], [[FOR_BODY133_LR_PH]] ] +; CHECK-NEXT: [[ACC_173_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_BODY133_LR_PH]] ], [ [[TMP80:%.*]], [[FOR_BODY27_CLONE]] ] +; CHECK-NEXT: [[TMP78:%.*]] = load float, ptr [[LSR_IV198]], align 4 +; CHECK-NEXT: [[TMP79:%.*]] = load float, ptr [[LSR_IV195]], align 4 +; CHECK-NEXT: [[TMP80]] = tail call float @llvm.fmuladd.f32(float [[TMP78]], float [[TMP79]], float [[ACC_173_CLONE]]) +; CHECK-NEXT: [[LSR_IV_NEXT193]] = add i32 [[LSR_IV192]], -1 +; CHECK-NEXT: [[SCEVGEP196]] = getelementptr i8, ptr [[LSR_IV195]], i32 4 +; CHECK-NEXT: [[SCEVGEP199]] = getelementptr i8, ptr [[LSR_IV198]], i32 4 +; CHECK-NEXT: [[EXITCOND84_NOT_CLONE:%.*]] = icmp eq i32 [[LSR_IV_NEXT193]], 0 +; CHECK-NEXT: br i1 [[EXITCOND84_NOT_CLONE]], label [[FOR_END141]], label [[FOR_BODY27_CLONE]] +; CHECK: for.end141: +; CHECK-NEXT: [[ACC0_3_LCSSA:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_COND130_PREHEADER]] ], [ [[TMP80]], [[FOR_BODY27_CLONE]] ] +; CHECK-NEXT: [[ADD60:%.*]] = fadd float [[ACC_1_LCSSA2_CLONE]], 
[[ACC0_3_LCSSA]] +; CHECK-NEXT: [[ADD6179:%.*]] = fadd float [[ACC_2_LCSSA_CLONE]], [[ACC_3_LCSSA_CLONE]] +; CHECK-NEXT: [[ADD62:%.*]] = fadd float [[ACC_4_LCSSA_CLONE]], [[ACC_5_LCSSA_CLONE]] +; CHECK-NEXT: [[ADD6380:%.*]] = fadd float [[ACC_6_LCSSA_CLONE]], [[ACC_7_LCSSA_CLONE]] +; CHECK-NEXT: [[ADD64:%.*]] = fadd float [[ADD6179]], [[ADD60]] +; CHECK-NEXT: [[ADD6581:%.*]] = fadd float [[ADD62]], [[ADD6380]] +; CHECK-NEXT: [[ADD66:%.*]] = fadd float [[ADD6581]], [[ADD64]] ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_080]] -; CHECK-NEXT: store float [[ACC_1_LCSSA]], ptr [[ARRAYIDX37]], align 4 -; CHECK-NEXT: [[EXITCOND85_NOT:%.*]] = icmp eq i32 [[INC39]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND85_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]] -; CHECK: for.body27: -; CHECK-NEXT: [[N22_075:%.*]] = phi i32 [ 0, [[FOR_BODY27_LR_PH]] ], [ [[INC34:%.*]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[COEFF_POS_174:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_BODY27_LR_PH]] ], [ [[INC29:%.*]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[ACC_173:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY27_LR_PH]] ], [ [[TMP17]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[INC29]] = add nuw nsw i32 [[COEFF_POS_174]], 1 -; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[COEFF_POS_174]] -; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX30]], align 4 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 [[N22_075]] -; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[TMP17]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[ACC_173]]) -; CHECK-NEXT: [[INC34]] = add nuw nsw i32 [[N22_075]], 1 -; CHECK-NEXT: [[EXITCOND84_NOT:%.*]] = icmp eq i32 [[INC34]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND84_NOT]], label [[FOR_COND_CLEANUP26]], label [[FOR_BODY27]] +; CHECK-NEXT: store float [[ADD66]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: [[INC152]] = add nuw nsw i32 [[I_080]], 1 +; CHECK-NEXT: [[EXITCOND350_NOT:%.*]] = icmp eq i32 [[INC152]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND350_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]] ; entry: %cmp77 = icmp sgt i32 %len, 0 diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll index a4fb7808a4f8ee..aa9f66e46f4e89 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll @@ -1,23 +1,79 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local float @test_loop(ptr nocapture noundef readonly %data1, ptr nocapture noundef readonly %data2) local_unnamed_addr { ; CHECK-LABEL: define dso_local float @test_loop( -; CHECK-SAME: ptr nocapture noundef readonly [[DATA1:%.*]], ptr nocapture noundef readonly [[DATA2:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[DATA1:%.*]], ptr noalias nocapture noundef readonly [[DATA2:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; 
CHECK: for.cond.cleanup: -; CHECK-NEXT: ret float [[TMP2:%.*]] +; CHECK: for.end: +; CHECK-NEXT: [[ADD37:%.*]] = fadd float [[TMP16:%.*]], [[TMP17:%.*]] +; CHECK-NEXT: [[ADD38:%.*]] = fadd float [[TMP18:%.*]], [[TMP19:%.*]] +; CHECK-NEXT: [[ADD39:%.*]] = fadd float [[TMP20:%.*]], [[TMP21:%.*]] +; CHECK-NEXT: [[ADD40:%.*]] = fadd float [[TMP22:%.*]], [[TMP23:%.*]] +; CHECK-NEXT: [[ADD41:%.*]] = fadd float [[ADD37]], [[ADD38]] +; CHECK-NEXT: [[ADD42:%.*]] = fadd float [[ADD39]], [[ADD40]] +; CHECK-NEXT: [[ADD43:%.*]] = fadd float [[ADD41]], [[ADD42]] +; CHECK-NEXT: ret float [[ADD43]] ; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[RESULT_06:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP2]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI4:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP20]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI6:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP22]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI7:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INC_7]] = add nuw nsw i32 [[I_07]], 8 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_07]], 1 +; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i32 [[I_07]], 2 +; CHECK-NEXT: [[ADD11:%.*]] = or disjoint i32 [[I_07]], 3 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_07]], 4 +; CHECK-NEXT: [[ADD15:%.*]] = or disjoint i32 [[I_07]], 5 +; CHECK-NEXT: [[ADD17:%.*]] = or disjoint i32 [[I_07]], 6 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[I_07]], 7 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[I_07]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[I_07]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD9]] +; CHECK-NEXT: [[ARRAYIDX1_2:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD9]] +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX1_3:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX1_4:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD15]] +; CHECK-NEXT: [[ARRAYIDX1_5:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD15]] +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD17]] +; CHECK-NEXT: [[ARRAYIDX1_6:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD17]] +; 
CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX1_7:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD19]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[RESULT_06]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX1_1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX1_3]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1_4]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX1_5]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX_6]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX1_6]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX_7]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX1_7]], align 4 +; CHECK-NEXT: [[TMP16]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[DOTPHI]]) +; CHECK-NEXT: [[TMP17]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[DOTPHI1]]) +; CHECK-NEXT: [[TMP18]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[DOTPHI2]]) +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[DOTPHI3]]) +; CHECK-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[DOTPHI4]]) +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[DOTPHI5]]) +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[DOTPHI6]]) +; CHECK-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[DOTPHI7]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC_7]], 1009 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; entry: br label %for.body diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll index bcf9852fd491ee..1a6c4fda2b5126 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, ptr noundef readonly %input2, ptr noundef writeonly %output, i32 
noundef %len, i32 noundef %step1, i32 noundef %step2, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_mul_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT1:%.*]], ptr noundef readonly [[INPUT2:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT1:%.*]], ptr noalias noundef readonly [[INPUT2:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT1]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[INPUT2]], null @@ -12,19 +12,159 @@ define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: br i1 [[OR_COND20]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP721:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP721]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_022_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_022_MODIFY]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_022_MODIFY]], 1 +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_022_MODIFY]], 2 +; CHECK-NEXT: [[ADD8:%.*]] = or disjoint i32 [[I_022_MODIFY]], 3 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[I_022_MODIFY]], 4 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_022_MODIFY]], 5 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_022_MODIFY]], 6 +; CHECK-NEXT: [[ADD26:%.*]] = or disjoint i32 [[I_022_MODIFY]], 7 +; CHECK-NEXT: [[ADD30:%.*]] = or disjoint i32 [[I_022_MODIFY]], 8 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_022_MODIFY]], 9 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_022_MODIFY]], 10 +; CHECK-NEXT: [[ADD42:%.*]] = or disjoint i32 [[I_022_MODIFY]], 11 +; CHECK-NEXT: [[ADD46:%.*]] = or disjoint i32 [[I_022_MODIFY]], 12 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[I_022_MODIFY]], 13 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[I_022_MODIFY]], 14 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_022_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[I_022_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX9_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[I_022_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX12_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_022_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] 
= getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr 
inbounds float, ptr [[INPUT2]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX53:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_MODIFY]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX43]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[MUL10_MODIFY:%.*]] = fmul float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP34:%.*]] = fmul float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = fmul float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP37:%.*]] = fmul float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP38:%.*]] = fmul float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP39:%.*]] = fmul float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP41:%.*]] = fmul float [[TMP18]], [[TMP19]] 
+; CHECK-NEXT: [[TMP42:%.*]] = fmul float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP43:%.*]] = fmul float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP44:%.*]] = fmul float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP45:%.*]] = fmul float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP46:%.*]] = fmul float [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[TMP47:%.*]] = fmul float [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP48:%.*]] = fmul float [[TMP32]], [[TMP33]] +; CHECK-NEXT: store float [[MUL10_MODIFY]], ptr [[ARRAYIDX12_MODIFY]], align 4 +; CHECK-NEXT: store float [[TMP34]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: store float [[TMP35]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: store float [[TMP36]], ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: store float [[TMP37]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[TMP38]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[TMP39]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: store float [[TMP40]], ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: store float [[TMP41]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[TMP42]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[TMP43]], ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: store float [[TMP44]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: store float [[TMP45]], ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: store float [[TMP46]], ptr [[ARRAYIDX53]], align 4 +; CHECK-NEXT: store float [[TMP47]], ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[TMP48]], ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_022:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_022:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_022]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[I_022]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[MUL10:%.*]] = fmul float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[MUL10:%.*]] = fmul float [[TMP49]], [[TMP50]] ; CHECK-NEXT: [[MUL11:%.*]] = mul nsw i32 [[I_022]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL11]] ; CHECK-NEXT: store float [[MUL10]], ptr [[ARRAYIDX12]], align 4 @@ -35,11 +175,11 @@ define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[I_022_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_022_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL8_CLONE:%.*]] = mul nsw i32 [[I_022_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 
[[MUL8_CLONE]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 -; CHECK-NEXT: [[MUL10_CLONE:%.*]] = fmul float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 +; CHECK-NEXT: [[MUL10_CLONE:%.*]] = fmul float [[TMP51]], [[TMP52]] ; CHECK-NEXT: [[MUL11_CLONE:%.*]] = mul nsw i32 [[I_022_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX12_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL11_CLONE]] ; CHECK-NEXT: store float [[MUL10_CLONE]], ptr [[ARRAYIDX12_CLONE]], align 4 @@ -47,7 +187,7 @@ define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll index 2c81f5bfd4b6f3..bf4e757def1373 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %input, ptr noalias noundef writeonly %output, i32 noundef %len, float noundef %C, i32 noundef %step_in, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_mulc_f32_ansi( ; CHECK-SAME: ptr noalias noundef readonly [[INPUT:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], float noundef [[C:%.*]], i32 noundef [[STEP_IN:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { @@ -10,16 +10,124 @@ define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %i ; CHECK-NEXT: br i1 [[OR_COND]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP413:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP413]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 
[[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_014_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_014_MODIFY]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_014_MODIFY]], 1 +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_014_MODIFY]], 2 +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_014_MODIFY]], 3 +; CHECK-NEXT: [[ADD10:%.*]] = or disjoint i32 [[I_014_MODIFY]], 4 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_014_MODIFY]], 5 +; CHECK-NEXT: [[ADD16:%.*]] = or disjoint i32 [[I_014_MODIFY]], 6 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[I_014_MODIFY]], 7 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_014_MODIFY]], 8 +; CHECK-NEXT: [[ADD25:%.*]] = or disjoint i32 [[I_014_MODIFY]], 9 +; CHECK-NEXT: [[ADD28:%.*]] = or disjoint i32 [[I_014_MODIFY]], 10 +; CHECK-NEXT: [[ADD31:%.*]] = or disjoint i32 [[I_014_MODIFY]], 11 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_014_MODIFY]], 12 +; CHECK-NEXT: [[ADD37:%.*]] = or disjoint i32 [[I_014_MODIFY]], 13 +; CHECK-NEXT: [[ADD40:%.*]] = or disjoint i32 [[I_014_MODIFY]], 14 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[I_014_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_014_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX7_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_014_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD16]] +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD16]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD25]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD25]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD31]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], 
i32 [[ADD31]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD37]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD37]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD40]] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD40]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[MUL5_MODIFY:%.*]] = fmul float [[C]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = fmul float [[C]], [[TMP3]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul float [[C]], [[TMP4]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul float [[C]], [[TMP5]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul float [[C]], [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = fmul float [[C]], [[TMP7]] +; CHECK-NEXT: [[TMP23:%.*]] = fmul float [[C]], [[TMP8]] +; CHECK-NEXT: [[TMP24:%.*]] = fmul float [[C]], [[TMP9]] +; CHECK-NEXT: [[TMP25:%.*]] = fmul float [[C]], [[TMP10]] +; CHECK-NEXT: [[TMP26:%.*]] = fmul float [[C]], [[TMP11]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul float [[C]], [[TMP12]] +; CHECK-NEXT: [[TMP28:%.*]] = fmul float [[C]], [[TMP13]] +; CHECK-NEXT: [[TMP29:%.*]] = fmul float [[C]], [[TMP14]] +; CHECK-NEXT: [[TMP30:%.*]] = fmul float [[C]], [[TMP15]] +; CHECK-NEXT: [[TMP31:%.*]] = fmul float [[C]], [[TMP16]] +; CHECK-NEXT: [[TMP32:%.*]] = fmul float [[C]], [[TMP17]] +; CHECK-NEXT: store float [[MUL5_MODIFY]], ptr [[ARRAYIDX7_MODIFY]], align 4 +; CHECK-NEXT: store float [[TMP18]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: store float [[TMP19]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: store float [[TMP20]], ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: store float [[TMP21]], ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: store float [[TMP22]], ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: store float [[TMP23]], ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: store float [[TMP24]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[TMP25]], ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: store float [[TMP26]], ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: store float 
[[TMP27]], ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: store float [[TMP28]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[TMP29]], ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: store float [[TMP30]], ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: store float [[TMP31]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: store float [[TMP32]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_014]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL5:%.*]] = fmul float [[TMP0]], [[C]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul float [[C]], [[TMP33]] ; CHECK-NEXT: [[MUL6:%.*]] = mul nsw i32 [[I_014]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL6]] ; CHECK-NEXT: store float [[MUL5]], ptr [[ARRAYIDX7]], align 4 @@ -30,8 +138,8 @@ define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %i ; CHECK-NEXT: [[I_014_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_014_CLONE]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[MUL5_CLONE:%.*]] = fmul float [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[MUL5_CLONE:%.*]] = fmul float [[C]], [[TMP34]] ; CHECK-NEXT: [[MUL6_CLONE:%.*]] = mul nsw i32 [[I_014_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX7_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL6_CLONE]] ; CHECK-NEXT: store float [[MUL5_CLONE]], ptr [[ARRAYIDX7_CLONE]], align 4 @@ -39,7 +147,7 @@ define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %i ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll index 99ac2877f76c6b..89c891af406691 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; 
RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, ptr noundef writeonly %output, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_sqrt_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[OUTPUT]], null @@ -10,15 +10,139 @@ define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: br i1 [[OR_COND]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP411:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP411]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP32:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_012_MODIFY:%.*]] = phi i32 [ [[TMP32]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_012_MODIFY]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[SHR_I_MODIFY:%.*]] = ashr i32 [[TMP1]], 1 +; CHECK-NEXT: [[ADD48:%.*]] = or disjoint i32 [[SHR_I_MODIFY]], 532365312 +; CHECK-NEXT: [[ARRAYIDX5_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_012_MODIFY]] +; CHECK-NEXT: store i32 [[ADD48]], ptr [[ARRAYIDX5_MODIFY]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_012_MODIFY]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP2]], 1 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[TMP3]], 532365312 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: store i32 [[ADD50]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_012_MODIFY]], 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ashr i32 [[TMP4]], 1 +; CHECK-NEXT: [[ADD52:%.*]] = or disjoint i32 [[TMP5]], 532365312 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD3]] +; CHECK-NEXT: store i32 [[ADD52]], ptr [[ARRAYIDX6]], 
align 4 +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_012_MODIFY]], 3 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ashr i32 [[TMP6]], 1 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[TMP7]], 532365312 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD7]] +; CHECK-NEXT: store i32 [[ADD54]], ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[ADD10:%.*]] = or disjoint i32 [[I_012_MODIFY]], 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = ashr i32 [[TMP8]], 1 +; CHECK-NEXT: [[ADD56:%.*]] = or disjoint i32 [[TMP9]], 532365312 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD10]] +; CHECK-NEXT: store i32 [[ADD56]], ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_012_MODIFY]], 5 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = ashr i32 [[TMP10]], 1 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[TMP11]], 532365312 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD13]] +; CHECK-NEXT: store i32 [[ADD58]], ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[ADD16:%.*]] = or disjoint i32 [[I_012_MODIFY]], 6 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD16]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = ashr i32 [[TMP12]], 1 +; CHECK-NEXT: [[ADD60:%.*]] = or disjoint i32 [[TMP13]], 532365312 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD16]] +; CHECK-NEXT: store i32 [[ADD60]], ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[I_012_MODIFY]], 7 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD19]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = ashr i32 [[TMP14]], 1 +; CHECK-NEXT: [[ADD62:%.*]] = or disjoint i32 [[TMP15]], 532365312 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD19]] +; CHECK-NEXT: store i32 [[ADD62]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_012_MODIFY]], 8 +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = ashr i32 [[TMP16]], 1 +; CHECK-NEXT: [[ADD64:%.*]] = or disjoint i32 [[TMP17]], 532365312 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: store i32 [[ADD64]], ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ADD25:%.*]] = or disjoint i32 [[I_012_MODIFY]], 9 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD25]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = ashr i32 [[TMP18]], 1 +; CHECK-NEXT: [[ADD66:%.*]] = or disjoint i32 [[TMP19]], 532365312 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD25]] +; CHECK-NEXT: store i32 [[ADD66]], ptr 
[[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[ADD28:%.*]] = or disjoint i32 [[I_012_MODIFY]], 10 +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = ashr i32 [[TMP20]], 1 +; CHECK-NEXT: [[ADD68:%.*]] = or disjoint i32 [[TMP21]], 532365312 +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD28]] +; CHECK-NEXT: store i32 [[ADD68]], ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: [[ADD31:%.*]] = or disjoint i32 [[I_012_MODIFY]], 11 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD31]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = ashr i32 [[TMP22]], 1 +; CHECK-NEXT: [[ADD70:%.*]] = or disjoint i32 [[TMP23]], 532365312 +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD31]] +; CHECK-NEXT: store i32 [[ADD70]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_012_MODIFY]], 12 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = ashr i32 [[TMP24]], 1 +; CHECK-NEXT: [[ADD72:%.*]] = or disjoint i32 [[TMP25]], 532365312 +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: store i32 [[ADD72]], ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[ADD37:%.*]] = or disjoint i32 [[I_012_MODIFY]], 13 +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD37]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = ashr i32 [[TMP26]], 1 +; CHECK-NEXT: [[ADD74:%.*]] = or disjoint i32 [[TMP27]], 532365312 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD37]] +; CHECK-NEXT: store i32 [[ADD74]], ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[ADD40:%.*]] = or disjoint i32 [[I_012_MODIFY]], 14 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD40]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = ashr i32 [[TMP28]], 1 +; CHECK-NEXT: [[ADD76:%.*]] = or disjoint i32 [[TMP29]], 532365312 +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD40]] +; CHECK-NEXT: store i32 [[ADD76]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[I_012_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = ashr i32 [[TMP30]], 1 +; CHECK-NEXT: [[ADD78:%.*]] = or disjoint i32 [[TMP31]], 532365312 +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD43]] +; CHECK-NEXT: store i32 [[ADD78]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[TMP32]] = add nuw i32 [[I_012_MODIFY]], 16 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP32]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], 
[[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_012]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[TMP33]], 1 ; CHECK-NEXT: [[ADD_I:%.*]] = add nsw i32 [[SHR_I]], 532365312 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_012]] ; CHECK-NEXT: store i32 [[ADD_I]], ptr [[ARRAYIDX5]], align 4 @@ -28,8 +152,8 @@ define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, pt ; CHECK: for.body.clone: ; CHECK-NEXT: [[I_012_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_012_CLONE]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[SHR_I_CLONE:%.*]] = ashr i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[SHR_I_CLONE:%.*]] = ashr i32 [[TMP34]], 1 ; CHECK-NEXT: [[ADD_I_CLONE:%.*]] = add nsw i32 [[SHR_I_CLONE]], 532365312 ; CHECK-NEXT: [[ARRAYIDX5_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_012_CLONE]] ; CHECK-NEXT: store i32 [[ADD_I_CLONE]], ptr [[ARRAYIDX5_CLONE]], align 4 @@ -37,7 +161,7 @@ define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll index 9468a11ba62329..19bca2d13e120e 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, ptr noundef readonly %input2, ptr noundef writeonly %output, i32 noundef %len, i32 noundef %step1, i32 noundef %step2, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_sub_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT1:%.*]], ptr noundef readonly [[INPUT2:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT1:%.*]], ptr noalias noundef readonly [[INPUT2:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 
noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT1]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[INPUT2]], null @@ -12,19 +12,159 @@ define dso_local noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: br i1 [[OR_COND19]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP720:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP720]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB63:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_021_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_021_MODIFY]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_021_MODIFY]], 1 +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_021_MODIFY]], 2 +; CHECK-NEXT: [[ADD8:%.*]] = or disjoint i32 [[I_021_MODIFY]], 3 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[I_021_MODIFY]], 4 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_021_MODIFY]], 5 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_021_MODIFY]], 6 +; CHECK-NEXT: [[ADD26:%.*]] = or disjoint i32 [[I_021_MODIFY]], 7 +; CHECK-NEXT: [[ADD30:%.*]] = or disjoint i32 [[I_021_MODIFY]], 8 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_021_MODIFY]], 9 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_021_MODIFY]], 10 +; CHECK-NEXT: [[ADD42:%.*]] = or disjoint i32 [[I_021_MODIFY]], 11 +; CHECK-NEXT: [[ADD46:%.*]] = or disjoint i32 [[I_021_MODIFY]], 12 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[I_021_MODIFY]], 13 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[I_021_MODIFY]], 14 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_021_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX9_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX11_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr 
[[INPUT1]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX53:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr 
[[INPUT2]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_MODIFY]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX43]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[SUB_MODIFY:%.*]] = fsub float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP34:%.*]] = fsub float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = fsub float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP36:%.*]] = fsub float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP37:%.*]] = fsub float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP38:%.*]] = fsub float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP39:%.*]] = fsub float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP40:%.*]] = fsub float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP41:%.*]] = fsub float [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP42:%.*]] = fsub float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP43:%.*]] = fsub float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP44:%.*]] = fsub float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP45:%.*]] = fsub float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP46:%.*]] = fsub float [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[TMP47:%.*]] = fsub float [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP48:%.*]] = fsub float [[TMP32]], [[TMP33]] +; CHECK-NEXT: store float [[SUB_MODIFY]], ptr [[ARRAYIDX11_MODIFY]], align 4 +; CHECK-NEXT: store float [[TMP34]], ptr [[ARRAYIDX3]], align 4 +; 
CHECK-NEXT: store float [[TMP35]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: store float [[TMP36]], ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: store float [[TMP37]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[TMP38]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[TMP39]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: store float [[TMP40]], ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: store float [[TMP41]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[TMP42]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[TMP43]], ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: store float [[TMP44]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: store float [[TMP45]], ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: store float [[TMP46]], ptr [[ARRAYIDX53]], align 4 +; CHECK-NEXT: store float [[TMP47]], ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[TMP48]], ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB63]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_021]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[I_021]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[TMP49]], [[TMP50]] ; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[I_021]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10]] ; CHECK-NEXT: store float [[SUB]], ptr [[ARRAYIDX11]], align 4 @@ -35,11 +175,11 @@ define dso_local noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[I_021_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL8_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8_CLONE]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 -; CHECK-NEXT: [[SUB_CLONE:%.*]] = fsub float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 +; CHECK-NEXT: [[SUB_CLONE:%.*]] = fsub float [[TMP51]], [[TMP52]] ; CHECK-NEXT: [[MUL10_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10_CLONE]] ; CHECK-NEXT: store float [[SUB_CLONE]], ptr [[ARRAYIDX11_CLONE]], align 4 @@ -47,7 +187,7 @@ define dso_local 
noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: