From 45f4abc5a6345ba0e4edc0c24df08ef38f1c3436 Mon Sep 17 00:00:00 2001 From: "chen.qian" Date: Fri, 15 Nov 2024 16:15:20 +0800 Subject: [PATCH] [Pass] add LoopUnrollAndRemainder pass --- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + .../RISCV/RISCVLoopUnrollAndRemainder.cpp | 5053 +++++++++++++++++ .../RISCV/RISCVLoopUnrollAndRemainder.h | 42 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7 + .../RISCV/RISCVLoopUnrollAndRemainder/add.ll | 162 +- .../RISCV/RISCVLoopUnrollAndRemainder/addc.ll | 126 +- .../RISCVLoopUnrollAndRemainder/ccorr.ll | 454 +- .../RISCV/RISCVLoopUnrollAndRemainder/conv.ll | 450 +- .../RISCV/RISCVLoopUnrollAndRemainder/corr.ll | 242 +- .../RISCVLoopUnrollAndRemainder/dotprod.ll | 129 +- .../dotprod_template_complex.ll | 115 +- .../RISCVLoopUnrollAndRemainder/dotprode.ll | 131 +- .../RISCV/RISCVLoopUnrollAndRemainder/fir.ll | 299 +- .../RISCV/RISCVLoopUnrollAndRemainder/fird.ll | 327 +- .../loopsecvconstant.ll | 78 +- .../RISCV/RISCVLoopUnrollAndRemainder/mul.ll | 162 +- .../RISCV/RISCVLoopUnrollAndRemainder/mulc.ll | 124 +- .../RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll | 142 +- .../RISCV/RISCVLoopUnrollAndRemainder/sub.ll | 162 +- 19 files changed, 7831 insertions(+), 375 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 654c84b0695c27..05581a9a9af296 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(RISCVCodeGen RISCVGatherScatterLowering.cpp RISCVSplitLoopByLength.cpp RISCVCustomLICM.cpp + RISCVLoopUnrollAndRemainder.cpp RISCVInsertVSETVLI.cpp RISCVInsertReadWriteCSR.cpp RISCVInsertWriteVXRM.cpp diff --git a/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp new file mode 100644 index 00000000000000..587090ec2cf922 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.cpp @@ -0,0 +1,5053 @@ +//===-- RISCVLoopUnrollAndRemainder.cpp - Loop Unrolling Pass +//------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a loop unrolling optimization pass specifically designed +// for Digital Signal Processing (DSP) algorithms. The pass targets common +// computational patterns found in various DSP operations including: +// - FIR and IIR filters +// - Convolution and correlation +// - Vector operations +// - Dot product calculations +// - Mathematical functions +// +// The pass performs the following main operations: +// 1. Identifies loops in DSP algorithm implementations +// 2. Unrolls the main computational loops, typically by a factor of 8 +// 3. Efficiently handles remainder iterations +// 4. Optimizes memory access patterns for improved cache utilization +// 5. Adjusts control flow and PHI nodes to support the unrolled structure +// 6. 
Performs cleanup and further optimization after unrolling +// +// This transformation can significantly improve performance for DSP algorithms +// by: +// - Increasing instruction-level parallelism +// - Improving cache utilization for data and coefficient access +// - Reducing loop overhead +// - Enabling better vectorization opportunities +// +// The pass is particularly effective for algorithms with intensive loop-based +// computations, where the main computational loop dominates the execution time. +// It aims to optimize both the main loop body and the handling of edge cases, +// providing a balance between performance and code size. +// +//===----------------------------------------------------------------------===// +#include "RISCVLoopUnrollAndRemainder.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/DCE.h" +#include "llvm/Transforms/Scalar/DeadStoreElimination.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopPeel.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/SimplifyCFGOptions.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "riscv-loop-unroll-and-remainder" + +// Enumeration to represent different types of unrolling +enum class UnrollType { + DOTPROD, + ADD_ADDC_SUB_MUL_MULC_SQRT, + 
CONV_CCORR,
+  FIRD,
+  FIR,
+  CORR,
+  UNKNOWN
+};
+
+// Global variable to store the current unroll type
+static UnrollType currentUnrollType = UnrollType::UNKNOWN;
+
+// Command line option to enable the RISCVLoopUnrollAndRemainder pass
+cl::opt<bool> llvm::EnableRISCVLoopUnrollAndRemainder(
+    "riscv-loop-unroll-and-remainder", cl::init(false),
+    cl::desc("Enable loop unrolling and remainder specific loop"));
+
+// Helper function to get a basic block by name from a function
+static BasicBlock *getBasicBlockByName(Function &F, StringRef Name) {
+  for (BasicBlock &BB : F)
+    if (BB.getName() == Name)
+      return &BB;
+  return nullptr;
+}
+
+// Helper function to get the first ICmp instruction with a specific predicate
+// in a basic block
+static ICmpInst *getFirstICmpInstWithPredicate(BasicBlock *BB,
+                                               ICmpInst::Predicate Predicate) {
+  for (Instruction &I : *BB) {
+    if (auto *CI = dyn_cast<ICmpInst>(&I)) {
+      if (CI->getPredicate() == Predicate) {
+        return CI;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last ICmp instruction with a specific predicate in
+// a basic block
+static ICmpInst *getLastICmpInstWithPredicate(BasicBlock *BB,
+                                              ICmpInst::Predicate Predicate) {
+  ICmpInst *lastICmp = nullptr;
+  for (Instruction &I : *BB) {
+    if (auto *CI = dyn_cast<ICmpInst>(&I)) {
+      if (CI->getPredicate() == Predicate) {
+        lastICmp = CI;
+      }
+    }
+  }
+  return lastICmp;
+}
+
+// Helper function to get the first ICmp instruction in a basic block
+static ICmpInst *getFirstICmpInst(BasicBlock *BB) {
+  for (Instruction &I : *BB) {
+    if (auto *CI = dyn_cast<ICmpInst>(&I)) {
+      return CI;
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last ICmp instruction in a basic block
+static ICmpInst *getLastICmpInst(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *icmp = dyn_cast<ICmpInst>(&*it)) {
+      return icmp;
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the first float PHI node in a basic block
+static PHINode *getFirstFloatPhi(BasicBlock *BB) {
+  for (auto &Inst : *BB) {
+    if (auto *Phi = dyn_cast<PHINode>(&Inst)) {
+      if (Phi->getType()->isFloatTy()) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last float PHI node in a basic block
+static PHINode *getLastFloatPhi(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *Phi = dyn_cast<PHINode>(&*it)) {
+      if (Phi->getType()->isFloatTy()) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the first 32-bit integer PHI node in a basic block
+static PHINode *getFirstI32Phi(BasicBlock *BB) {
+  for (auto &Inst : *BB) {
+    if (auto *Phi = dyn_cast<PHINode>(&Inst)) {
+      if (Phi->getType()->isIntegerTy(32)) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last 32-bit integer PHI node in a basic block
+static PHINode *getLastI32Phi(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *Phi = dyn_cast<PHINode>(&*it)) {
+      if (Phi->getType()->isIntegerTy(32)) {
+        return Phi;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the last PHI node in a basic block
+static PHINode *getLastPhi(BasicBlock *BB) {
+  for (auto it = BB->rbegin(); it != BB->rend(); ++it) {
+    if (auto *Phi = dyn_cast<PHINode>(&*it)) {
+      return Phi;
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to get the first CallInst with a specific name in a basic
+// block
+static CallInst *getFirstCallInstWithName(BasicBlock *BB, StringRef Name) {
+  for (Instruction &I : *BB) {
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (Call->getCalledFunction() &&
+          Call->getCalledFunction()->getName() == Name) {
+        return Call;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Helper function to update operands of new instructions
+static void updateOperands(SmallVector<Instruction *> &NewInsts,
+                           ValueToValueMapTy &ValueMap) {
+  for (Instruction *inst : NewInsts) {
+    for (unsigned i = 0; i < inst->getNumOperands(); i++) {
+      Value *op = inst->getOperand(i);
+      if (ValueMap.count(op)) {
+        inst->setOperand(i, ValueMap[op]);
+      }
+    }
+  }
+}
+
+// Helper function to swap the successors of a terminator instruction
+static void swapTerminatorSuccessors(BasicBlock *BB) {
+  if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    if (BI->isConditional() && BI->getNumSuccessors() == 2) {
+      BasicBlock *TrueSuccessor = BI->getSuccessor(0);
+      BasicBlock *FalseSuccessor = BI->getSuccessor(1);
+      BI->setSuccessor(0, FalseSuccessor);
+      BI->setSuccessor(1, TrueSuccessor);
+    } else {
+      llvm_unreachable("BB's terminator is not a conditional branch or doesn't "
+                       "have two successors");
+    }
+  } else {
+    llvm_unreachable("BB's terminator is not a branch instruction");
+  }
+}
+
+// Helper function to clone a basic block and update its relations
+static BasicBlock *cloneBasicBlockWithRelations(BasicBlock *BB,
+                                                const std::string &NameSuffix,
+                                                Function *F) {
+  ValueToValueMapTy VMap;
+  BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+
+  // Update instruction references in the new block
+  for (Instruction &I : *NewBB) {
+    // Update operands
+    for (Use &U : I.operands()) {
+      Value *V = U.get();
+      Value *NewV = VMap[V];
+      if (NewV) {
+        U.set(NewV);
+      }
+    }
+
+    // Update PHI node basic block references
+    if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        BasicBlock *IncomingBB = PN->getIncomingBlock(i);
+        if (IncomingBB == BB) {
+          PN->setIncomingBlock(i, NewBB);
+        } else if (VMap.count(IncomingBB)) {
+          PN->setIncomingBlock(i, cast<BasicBlock>(VMap[IncomingBB]));
+        }
+      }
+    }
+  }
+
+  return NewBB;
+}
+
+// Helper function to unroll and duplicate a loop iteration
+static Instruction *unrollAndDuplicateLoopIteration(LLVMContext &Ctx,
+                                                    BasicBlock *BB,
+                                                    IRBuilder<> &Builder,
+                                                    unsigned int i) {
+  PHINode *IPhi = dyn_cast<PHINode>(&BB->front());
+  BasicBlock::iterator BeginIt, EndIt, ToIt;
+  SmallVector<Instruction *> newInsts;
+  ValueToValueMapTy ValueMap;
+  Instruction *Add = nullptr;
+  Instruction *tailcallfmuladd = nullptr;
+  Instruction *duplicatedPhiNode = nullptr;
+
+  // Find the range of instructions to duplicate
+  for (Instruction &I : *BB) {
+    if (auto *phi = dyn_cast<PHINode>(&I)) {
+      if (phi->getType()->isFloatTy()) {
+        BeginIt = I.getIterator();
+      }
+    } else if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) {
+      EndIt = std::next(I.getIterator());
+      tailcallfmuladd = &I;
+      ToIt = std::next(EndIt);
+      break;
+    }
+  }
+
+  assert(&*BeginIt && &*EndIt && "Failed to find instruction range");
+
+  // Clone and modify instructions
+  int arrayidx = 0;
+  for (auto it = BeginIt; it != EndIt; ++it) {
+    Instruction *newInst = it->clone();
+    if (newInst->getOpcode() == Instruction::PHI)
+      newInst->setName("acc" + Twine(i));
+
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(newInst)) {
+      if (!Add)
+        Add = BinaryOperator::CreateDisjoint(
+            Instruction::Or, IPhi, ConstantInt::get(Type::getInt32Ty(Ctx), i),
+            "add" + Twine(i), BB);
+
+      newInst->setName("arrayidx" + Twine(i) + "_" + Twine(arrayidx));
+      newInst->setOperand(1, Add);
+      arrayidx++;
+    }
+    newInsts.push_back(newInst);
+    ValueMap[&*it] = newInst;
+  }
+
+  // Update operands and insert new instructions
+  updateOperands(newInsts, ValueMap);
+  for (Instruction 
*newInst : newInsts) { + if (newInst->getOpcode() == Instruction::PHI) + duplicatedPhiNode = newInst->clone(); + newInst->insertInto(BB, BB->end()); + } + + return duplicatedPhiNode; +} + +// Helper function to move PHI nodes to the top of a basic block +static void movePHINodesToTop(BasicBlock &BB, + BasicBlock *ForBodyPreheaderBB = nullptr) { + SmallVector PHIs; + for (Instruction &I : BB) { + if (PHINode *PHI = dyn_cast(&I)) { + if (ForBodyPreheaderBB) + PHI->setIncomingBlock(1, ForBodyPreheaderBB); + PHIs.push_back(PHI); + } + } + + // Move PHI nodes in reverse order + for (auto it = PHIs.rbegin(); it != PHIs.rend(); ++it) { + (*it)->moveBefore(&BB.front()); + } +} + +// Helper function to update predecessors to point to a new preheader +static void updatePredecessorsToPreheader(BasicBlock *ForBody, + BasicBlock *ForBodyPreheader) { + SmallVector predecessors_bb; + for (auto *Pred : predecessors(ForBody)) { + if (Pred != ForBody) + predecessors_bb.push_back(Pred); + } + + for (BasicBlock *Pred : predecessors_bb) { + Instruction *TI = Pred->getTerminator(); + for (unsigned i = 0; i < TI->getNumSuccessors(); ++i) { + if (TI->getSuccessor(i) == ForBody) { + TI->setSuccessor(i, ForBodyPreheader); + } + } + } + + if (!ForBodyPreheader->getTerminator()) { + BranchInst::Create(ForBody, ForBodyPreheader); + } +} + +// Helper function to get the 'len' value from the entry block +static Value *getLenFromEntryBlock(Function &F) { + ICmpInst *ICmp = nullptr; + for (BasicBlock &BB : F) { + ICmp = getFirstICmpInstWithPredicate(&BB, ICmpInst::ICMP_SGT); + if (ICmp) + break; + } + + assert(ICmp && "icmp sgt instruction not found"); + return ICmp->getOperand(0); +} + +// Helper function to find specific instructions in a basic block +static std::tuple +findKeyInstructions(BasicBlock *ForBody) { + PHINode *ThirdPHI = nullptr; + CallInst *callInst = nullptr; + BinaryOperator *addInst = nullptr; + int PHICount = 0; + + for (Instruction &I : *ForBody) { + if (auto *PHI = dyn_cast(&I)) { + PHICount++; + if (PHICount == 3) { + ThirdPHI = PHI; + } + } else if (auto *ci = dyn_cast(&I)) { + callInst = ci; + } else if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Add) { + addInst = BinOp; + } + } + } + + return std::make_tuple(ThirdPHI, callInst, addInst); +} + +// Helper function to rename instructions +static void renameInstruction(Instruction *inst) { + if (inst->getOpcode() == Instruction::PHI) { + inst->setName("acc"); + } else if (inst->getOpcode() == Instruction::GetElementPtr) { + inst->setName("arrayidx"); + } +} + +// Helper function to set add instruction in for body +static void setAddInForBody(Instruction *inst, Instruction *Add, + Instruction *InsertBefore) { + if (inst->getOpcode() == Instruction::PHI) { + Add->moveBefore(InsertBefore); + } else if (inst->getOpcode() == Instruction::GetElementPtr) { + inst->setOperand(1, Add); + } +} + +// Helper function to copy and remap instructions +static void copyAndRemapInstructions(Instruction *StartInst, + Instruction *EndInst, + Instruction *InsertBefore, + Instruction *Add) { + ValueToValueMapTy ValueMap; + SmallVector NewInsts; + + for (auto it = StartInst->getIterator(); &*it != EndInst; ++it) { + Instruction *newInst = it->clone(); + if (auto *BinOp = dyn_cast(newInst)) { + if (BinOp->getOpcode() == Instruction::Add) { + continue; + } + } + NewInsts.push_back(newInst); + ValueMap[&*it] = newInst; + } + + updateOperands(NewInsts, ValueMap); + + for (Instruction *newInst : NewInsts) { + renameInstruction(newInst); + 
newInst->insertBefore(InsertBefore); + setAddInForBody(newInst, Add, InsertBefore); + } +} + +// Helper function to preprocess the cloned for body +static void preProcessClonedForBody(BasicBlock *ClonedForBody, Value *sub) { + Instruction *addInst = nullptr; + for (Instruction &I : *ClonedForBody) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Add) { + BinOp->setOperand(1, ConstantInt::get(BinOp->getType(), 8)); + addInst = BinOp; + } + } + if (auto *icmp = dyn_cast(&I)) { + icmp->setPredicate(CmpInst::Predicate::ICMP_SLT); + icmp->setOperand(0, addInst); + icmp->setOperand(1, sub); + icmp->setName("cmp11"); + } + } + LLVM_DEBUG(ClonedForBody->dump()); +} + +// Helper function to modify getelementptr instructions +static void modifyGetElementPtr(BasicBlock *BB) { + SmallVector gepInsts; + Value *firstGEPOperand0 = nullptr; + Value *secondGEPOperand1 = nullptr; + + for (Instruction &I : *BB) { + if (auto *GEP = dyn_cast(&I)) { + gepInsts.push_back(GEP); + } + } + + if (gepInsts.size() < 8 || gepInsts.size() % 2 != 0) { + return; + } + + firstGEPOperand0 = gepInsts[0]; + secondGEPOperand1 = gepInsts[1]; + + for (size_t i = 2; i < gepInsts.size(); ++i) { + if (i % 2 == 0) { + if (i < gepInsts.size() - 2) { + gepInsts[i]->setOperand(0, firstGEPOperand0); + } + } else { + gepInsts[i]->setOperand(0, secondGEPOperand1); + } + + if (i == 14) + continue; + + Instruction *operand1 = dyn_cast(gepInsts[i]->getOperand(1)); + gepInsts[i]->setOperand( + 1, ConstantInt::get(Type::getInt32Ty(BB->getContext()), i / 2)); + if (operand1 && operand1->use_empty()) { + operand1->eraseFromParent(); + } + } +} + +// Helper function to check if a PHI node has an incoming value of zero +static bool isIncomingValueZeroOfPhi(PHINode *phi) { + return phi->getType()->isIntegerTy(32) && + isa(phi->getIncomingValue(0)) && + cast(phi->getIncomingValue(0))->isZero(); +} + +// Helper function to find and set add instructions +static std::pair +findAndSetAddInstructions(BasicBlock *ClonedForBody) { + Instruction *FirstAdd = nullptr; + Instruction *SecondAdd = nullptr; + + for (Instruction &I : *ClonedForBody) { + if (BinaryOperator *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Add) { + if (!FirstAdd) { + FirstAdd = &I; + FirstAdd->setHasNoSignedWrap(true); + } else if (!SecondAdd) { + SecondAdd = &I; + break; + } + } + } + } + assert(FirstAdd && SecondAdd && "Failed to find matching add instructions"); + return std::make_pair(FirstAdd, SecondAdd); +} + +// Helper functions for PHI node manipulation + +static PHINode *findZeroInitializedPHI(BasicBlock *block) { + for (Instruction &I : *block) { + if (PHINode *phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(phi)) { + return phi; + } + } + } + return nullptr; +} + +static PHINode *findIntegerPHI(BasicBlock *block) { + for (Instruction &I : *block) { + if (PHINode *phi = dyn_cast(&I)) { + if (phi->getType()->isIntegerTy(32) && !isIncomingValueZeroOfPhi(phi)) { + return phi; + } + } + } + return nullptr; +} + +// Helper function to unroll loop body +static void unrollLoopBody(BasicBlock *block, PHINode *thirdPHI, + Instruction *callInst, Instruction *addInst, + PHINode *zeroInitializedPHI, LLVMContext &context) { + for (int i = 1; i < 8; i++) { + Instruction *add = BinaryOperator::CreateDisjoint( + Instruction::Or, zeroInitializedPHI, + ConstantInt::get(Type::getInt32Ty(context), i), "add" + Twine(i), + block); + copyAndRemapInstructions(thirdPHI, callInst->getNextNode(), addInst, add); + } +} + +// Helper function to update 
add instruction +static void updateAddInstruction(Instruction *addInst, PHINode *integerPHI, + LLVMContext &context) { + if (addInst) { + addInst->setOperand(1, ConstantInt::get(Type::getInt32Ty(context), 8)); + addInst->setOperand(0, integerPHI); + } +} + +// Helper function to update block terminator +static void updateBlockTerminator(BasicBlock *block, BasicBlock *successor) { + Instruction *terminator = block->getTerminator(); + terminator->setSuccessor(0, block); + terminator->setSuccessor(1, successor); +} + +// Helper function to modify getelementptr for unrolling +static void modifyGetElementPtrForUnrolling(BasicBlock *block) { + SmallVector gepInsts; + for (Instruction &I : *block) { + if (auto *GEP = dyn_cast(&I)) { + gepInsts.push_back(GEP); + } + } + + for (size_t i = 2; i < gepInsts.size(); i += 2) { + gepInsts[i]->setOperand(0, gepInsts[0]); + gepInsts[i]->setOperand( + 1, ConstantInt::get(Type::getInt32Ty(block->getContext()), i / 2)); + } +} + +// Helper function to handle add instructions +static void handleAddInstructions(BasicBlock *block, unsigned int unrollFactor, + PHINode *zeroInitializedPHI, + LLVMContext &context) { + auto [firstAdd, secondAdd] = findAndSetAddInstructions(block); + + if (firstAdd && secondAdd) { + firstAdd->moveBefore(secondAdd); + + if (unrollFactor == 1) { + firstAdd->setOperand(1, ConstantInt::get(Type::getInt32Ty(context), 8)); + secondAdd->setOperand(0, zeroInitializedPHI); + } + } +} + +// Function to unroll the cloned for loop body +static void unrollClonedForBody(BasicBlock *clonedForBody, + BasicBlock *forCondPreheader, + unsigned int unrollFactor = 0) { + Function *function = clonedForBody->getParent(); + LLVMContext &context = function->getContext(); + + // Find key instructions in the cloned for body + auto [thirdPHI, callInst, addInst] = findKeyInstructions(clonedForBody); + PHINode *zeroInitializedPHI = findZeroInitializedPHI(clonedForBody); + PHINode *integerPHI = findIntegerPHI(clonedForBody); + + assert(zeroInitializedPHI && "No matching zero-initialized PHI node found"); + + // Unroll the loop body if key instructions are found + if (thirdPHI && callInst) { + unrollLoopBody(clonedForBody, thirdPHI, callInst, addInst, + zeroInitializedPHI, context); + } + + // Update the add instruction + updateAddInstruction(addInst, integerPHI, context); + + // Update the basic block terminator + updateBlockTerminator(clonedForBody, forCondPreheader); + + // Move PHI nodes to the top of the basic block + movePHINodesToTop(*clonedForBody); + + // Modify getelementptr instructions based on the unroll factor + if (unrollFactor == 0) { + modifyGetElementPtr(clonedForBody); + } else { + modifyGetElementPtrForUnrolling(clonedForBody); + } + + // Handle add instructions + handleAddInstructions(clonedForBody, unrollFactor, zeroInitializedPHI, + context); +} + +// Function to check if a call instruction can be moved +static bool canMoveCallInstruction(CallInst *callInst, + Instruction *insertPoint) { + for (unsigned i = 0; i < callInst->getNumOperands(); ++i) { + if (auto *operandInst = dyn_cast(callInst->getOperand(i))) { + if (operandInst->getParent() == callInst->getParent() && + insertPoint->comesBefore(operandInst)) { + return false; + } + } + } + return true; +} + +// Function to group and reorder instructions in a basic block +static void groupAndReorderInstructions(BasicBlock *clonedForBody) { + // Collect different types of instructions + SmallVector phiNodes; + SmallVector orInsts, gepInsts, loadInsts, storeInsts, mulInsts, + addInsts, subInsts, 
callInsts, ashrInsts, faddInsts, fmulInsts, fsubInsts; + + // Categorize instructions by type + for (Instruction &I : *clonedForBody) { + if (auto *phi = dyn_cast(&I)) { + phiNodes.push_back(phi); + } else if (I.getOpcode() == Instruction::Or) { + orInsts.push_back(&I); + } else if (isa(&I)) { + gepInsts.push_back(&I); + } else if (isa(&I)) { + loadInsts.push_back(&I); + } else if (isa(&I)) { + storeInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::Mul) { + mulInsts.push_back(&I); + } else if (isa(&I)) { + callInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::Add) { + addInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::Sub) { + subInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::FAdd) { + faddInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::FMul) { + fmulInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::FSub) { + fsubInsts.push_back(&I); + } else if (I.getOpcode() == Instruction::AShr) { + return; + } + } + + // If no PHI nodes are found, return + if (phiNodes.empty()) { + return; + } + + // Reorder instructions + Instruction *insertPoint = phiNodes.back()->getNextNode(); + bool canMoveCallInst = + callInsts.empty() || + canMoveCallInstruction(dyn_cast(callInsts[0]), insertPoint); + + auto moveInstructions = [&insertPoint](SmallVector &insts) { + for (auto *inst : insts) { + inst->moveBefore(insertPoint); + insertPoint = inst->getNextNode(); + } + }; + + // Move instructions in the desired order + moveInstructions(mulInsts); + moveInstructions(addInsts); + moveInstructions(orInsts); + moveInstructions(subInsts); + moveInstructions(gepInsts); + moveInstructions(loadInsts); + moveInstructions(faddInsts); + moveInstructions(fmulInsts); + moveInstructions(fsubInsts); + if (canMoveCallInst) { + moveInstructions(callInsts); + } +} + +// Function to transform a single loop depth (currently suitable for +// dotprod/dotprode example) +static bool transformOneLoopDepth(Function &F) { + LLVMContext &ctx = F.getContext(); + bool changed = false; + + // Get necessary basic blocks and values + Value *len = getLenFromEntryBlock(F); + BasicBlock *entryBB = &F.getEntryBlock(); + BasicBlock *forBodyBB = getBasicBlockByName(F, "for.body"); + BasicBlock *forBodyNewBB = getBasicBlockByName(F, "for.body.clone"); + BasicBlock *ifEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *forCond46PreheaderBB = + getBasicBlockByName(F, "for.cond.preheader"); + + assert(forBodyBB && "Expected to find for.body!"); + assert(forBodyNewBB && "Expected to find for.body.clone!"); + assert(ifEnd && "Expected to find if.end!"); + assert(forCond46PreheaderBB && "Expected to find for.cond.preheader!"); + + // Create new basic blocks + BasicBlock *forCondPreheaderBB = + BasicBlock::Create(F.getContext(), "for.cond.preheader", &F, forBodyBB); + BasicBlock *forBodyPreheaderBB = + BasicBlock::Create(F.getContext(), "for.body.preheader", &F, forBodyBB); + BasicBlock *forCond31PreheaderBB = + BasicBlock::Create(F.getContext(), "for.cond31.preheader", &F, forBodyBB); + BasicBlock *forBody33BB = cloneBasicBlockWithRelations(forBodyBB, "33", &F); + forBody33BB->setName("for.body33"); + forBody33BB->moveAfter(forBodyBB); + BasicBlock *forEnd37BB = + BasicBlock::Create(F.getContext(), "for.end37", &F, forBodyNewBB); + + // Add instructions to forCondPreheaderBB + IRBuilder<> builder(forCondPreheaderBB); + Value *negativeSeven = ConstantInt::get(Type::getInt32Ty(F.getContext()), -7); + Value *sub = builder.CreateNSWAdd(len, negativeSeven, "sub"); + Value 
*seven = ConstantInt::get(Type::getInt32Ty(F.getContext()), 7); + Value *cmp1113 = builder.CreateICmpUGT(len, seven, "cmp1113"); + builder.CreateCondBr(cmp1113, forBodyPreheaderBB, forCond31PreheaderBB); + + // Add instructions to forBodyPreheaderBB + builder.SetInsertPoint(forBodyPreheaderBB); + Value *mask = ConstantInt::get(Type::getInt32Ty(F.getContext()), 2147483640); + Value *andValue = builder.CreateAnd(len, mask, ""); + builder.CreateBr(forBodyBB); + + // Modify for.body + PHINode *iPhi = dyn_cast(&forBodyBB->front()); + iPhi->setName("i.0122"); + + // copy first float phinode from forBodyBB to forCond31PreheaderBB + PHINode *firstFloatPhi = getFirstFloatPhi(forBodyBB); + PHINode *acc00Lcssa = PHINode::Create(firstFloatPhi->getType(), 2, + "acc0.0.lcssa", forCond31PreheaderBB); + acc00Lcssa->addIncoming(firstFloatPhi->getIncomingValue(0), + firstFloatPhi->getIncomingBlock(0)); + acc00Lcssa->addIncoming(firstFloatPhi->getIncomingValue(1), + forCondPreheaderBB); + // Unroll and duplicate loop iterations + SmallVector instructions; + for (int i = 0; i < 7; i++) { + Instruction *copyedPhiNode = + unrollAndDuplicateLoopIteration(ctx, forBodyBB, builder, i + 1); + if (PHINode *phi = dyn_cast(copyedPhiNode)) { + phi->setName("acc" + Twine(i + 1) + ".0.lcssa"); + phi->setIncomingBlock(1, forCondPreheaderBB); + phi->insertInto(forCond31PreheaderBB, forCond31PreheaderBB->end()); + instructions.push_back(phi); + } + } + + // Update for.body terminator + Instruction *incInst = nullptr; + MDNode *loopMD = nullptr; + for (auto &I : *forBodyBB) { + if (I.getOpcode() == Instruction::Add) { + incInst = &I; + Instruction *icmp = I.getNextNode(); + Instruction *br = icmp->getNextNode(); + assert(icmp->getOpcode() == Instruction::ICmp && + br->getOpcode() == Instruction::Br && + "Unexpected instruction sequence"); + I.moveAfter(&forBodyBB->back()); + loopMD = br->getMetadata(LLVMContext::MD_loop); + br->eraseFromParent(); + icmp->eraseFromParent(); + break; + } + } + + // Modify add instruction + incInst->setOperand(1, ConstantInt::get(Type::getInt32Ty(F.getContext()), 8)); + incInst->setName("add30"); + + builder.SetInsertPoint(forBodyBB); + Value *cmp1 = builder.CreateICmpSLT(incInst, sub, "cmp1"); + BranchInst *newBr = + builder.CreateCondBr(cmp1, forBodyBB, forCond31PreheaderBB); + newBr->setMetadata(LLVMContext::MD_loop, loopMD); + + movePHINodesToTop(*forBodyBB, forBodyPreheaderBB); + + // Add instructions to forCond31PreheaderBB + builder.SetInsertPoint(forCond31PreheaderBB); + PHINode *i0Lcssa = + builder.CreatePHI(Type::getInt32Ty(F.getContext()), 0, "i.0.lcssa"); + i0Lcssa->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + forCondPreheaderBB); + i0Lcssa->addIncoming(andValue, forBodyBB); + Value *cmp32132 = builder.CreateICmpSLT(i0Lcssa, len, "cmp32132"); + builder.CreateCondBr(cmp32132, forBody33BB, forEnd37BB); + + // Modify forBody33BB + Instruction *tempInstr = nullptr; + for (auto &I : *forBody33BB) { + if (PHINode *phi = dyn_cast(&I)) { + if (phi->getType()->isIntegerTy(32)) { + phi->setIncomingValue(1, i0Lcssa); + phi->setIncomingBlock(1, forCond31PreheaderBB); + } else if (phi->getType()->isFloatTy()) { + phi->setIncomingValue(1, acc00Lcssa); + phi->setIncomingBlock(1, forCond31PreheaderBB); + tempInstr = phi; + } + } + } + + // Modify forEnd37BB + Instruction *acc01Lcssa = tempInstr->clone(); + acc01Lcssa->setName("acc0.1.lcssa"); + acc01Lcssa->insertInto(forEnd37BB, forEnd37BB->end()); + builder.SetInsertPoint(forEnd37BB); + + // Create pairs of floating-point 
additions + Value *sum01 = builder.CreateFAdd(acc01Lcssa, instructions[0], "sum01"); + Value *sum23 = builder.CreateFAdd(instructions[1], instructions[2], "sum23"); + Value *sum45 = builder.CreateFAdd(instructions[3], instructions[4], "sum45"); + Value *sum67 = builder.CreateFAdd(instructions[5], instructions[6], "sum67"); + + // Combine pairs + Value *sum0123 = builder.CreateFAdd(sum01, sum23, "sum0123"); + Value *sum4567 = builder.CreateFAdd(sum45, sum67, "sum4567"); + + // Final addition + Value *currentAdd = builder.CreateFAdd(sum0123, sum4567, "add44"); + builder.CreateBr(ifEnd); + + // Modify entry basic block + BranchInst *entryBi = dyn_cast(entryBB->getTerminator()); + entryBi->setSuccessor(0, forCondPreheaderBB); + entryBi->setSuccessor(1, forCond46PreheaderBB); + + // Modify forCond46PreheaderBB + forCond46PreheaderBB->getTerminator()->getPrevNode()->setName("cmp47110"); + + // Modify for.body33 + BranchInst *forBody33Bi = dyn_cast(forBody33BB->getTerminator()); + forBody33Bi->setSuccessor(0, forEnd37BB); + forBody33Bi->setSuccessor(1, forBody33BB); + + // Modify if.end + PHINode *ifEndPhi = dyn_cast(&ifEnd->front()); + ifEndPhi->setIncomingValue(1, currentAdd); + ifEndPhi->setIncomingBlock(1, forEnd37BB); + + changed = true; + return changed; +} + +// Function to unroll the cloned for.cond.preheader +static void unrollClonedForCondPreheader(BasicBlock *clonedForBody, + BasicBlock *clonedForCondPreheader, + BasicBlock *forCondPreheader) { + Function *F = clonedForBody->getParent(); + BasicBlock *forBody = getBasicBlockByName(*F, "for.body"); + assert(forBody && "Expected to find for.body!"); + + // Find PHI instructions in clonedForBody + SmallVector phiNodes; + for (Instruction &I : *clonedForBody) { + if (PHINode *phi = dyn_cast(&I)) { + phiNodes.push_back(phi); + } + } + + // Remove unused PHI nodes in clonedForCondPreheader + SmallVector unusedPhiNodes; + for (Instruction &I : *clonedForCondPreheader) { + if (PHINode *phi = dyn_cast(&I)) { + if (phi->use_empty()) { + unusedPhiNodes.push_back(phi); + } + } + } + for (PHINode *phi : unusedPhiNodes) { + phi->eraseFromParent(); + } + + // Clone PHI instructions to the beginning of clonedForCondPreheader + Instruction *insertPoint = &clonedForCondPreheader->front(); + SmallVector clonedPhiNodes; + for (PHINode *phi : phiNodes) { + PHINode *clonedPhi = cast(phi->clone()); + clonedPhi->setName(phi->getName() + ".clone"); + clonedPhi->setIncomingBlock(0, forBody); + clonedPhi->insertBefore(insertPoint); + insertPoint = clonedPhi->getNextNode(); + clonedPhiNodes.push_back(clonedPhi); + } + + // Find and clone the unique icmp instruction in forBody + Value *specStoreSelect = nullptr; + Instruction *cmpSlt = nullptr; + for (Instruction &I : *forBody) { + if (auto *icmp = dyn_cast(&I)) { + specStoreSelect = icmp->getOperand(0); + cmpSlt = icmp->clone(); + cmpSlt->setName("cmp_slt"); + cmpSlt->insertAfter(insertPoint); + break; + } + } + assert(specStoreSelect && "Failed to find icmp instruction in ForBody"); + + // Replace the existing icmp in clonedForCondPreheader + for (Instruction &I : *clonedForCondPreheader) { + if (auto *icmp = dyn_cast(&I)) { + icmp->replaceAllUsesWith(cmpSlt); + icmp->eraseFromParent(); + break; + } + } + + // Set the operand of cmp_slt to the first cloned PHI node + cmpSlt->setOperand(0, clonedPhiNodes[0]); + + // Update the successor of clonedForCondPreheader + clonedForCondPreheader->getTerminator()->setSuccessor(1, forCondPreheader); +} + +static std::tuple +modifyForBodyPreheader(BasicBlock 
*ForBodyPreheader, + BasicBlock *ClonedForCondPreheader) { + PHINode *TargetPHI = nullptr; + PHINode *TargetPHI2 = nullptr; + PHINode *TargetPHI3 = nullptr; + for (Instruction &I : *ClonedForCondPreheader) { + if (auto *phi = dyn_cast(&I)) { + if (phi->getType()->isIntegerTy(32)) { + if (isIncomingValueZeroOfPhi(phi)) { + // Found the target PHI node + TargetPHI = phi; + } else { + TargetPHI2 = phi; + } + } else if (phi->getType()->isFloatTy()) { + if (TargetPHI3 == nullptr) { + TargetPHI3 = phi; + break; + } + } + } + } + BinaryOperator *NewSub = nullptr; + for (Instruction &I : *ForBodyPreheader) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::Sub) { + // Change to add + NewSub = BinaryOperator::CreateAdd(BinOp->getOperand(0), TargetPHI, + BinOp->getName(), BinOp); + BinOp->replaceAllUsesWith(NewSub); + BinOp->eraseFromParent(); + break; + } + } + } + + ForBodyPreheader->moveAfter(ClonedForCondPreheader); + assert(NewSub && "NewSub should not be nullptr"); + return std::make_tuple(NewSub, TargetPHI2, TargetPHI3); +} + +static Value *expandForCondPreheader( + BasicBlock *ForBody, BasicBlock *ForCondPreheader, + BasicBlock *ClonedForCondPreheader, + std::tuple NewSubAndTargetPHI3) { + Instruction *TargetInst = + getFirstCallInstWithName(ForBody, "llvm.fmuladd.f32"); + assert(TargetInst && "TargetInst not found"); + Value *NewSub = std::get<0>(NewSubAndTargetPHI3); + Value *TargetPHI2 = std::get<1>(NewSubAndTargetPHI3); + Value *TargetPHI3 = std::get<2>(NewSubAndTargetPHI3); + // Create new .loopexit basic block + BasicBlock *LoopExit = BasicBlock::Create( + ForCondPreheader->getContext(), ForCondPreheader->getName() + ".loopexit", + ForCondPreheader->getParent(), ForCondPreheader); + + // Create new sub instruction in .loopexit block + IRBuilder<> Builder(LoopExit); + Value *NewSubInst = Builder.CreateSub(NewSub, TargetPHI2); + + // Add unconditional branch to ForCondPreheader + Builder.CreateBr(ForCondPreheader); + + // Find the target PHI node in ClonedForCondPreheader + PHINode *TargetPHI = nullptr; + for (PHINode &Phi : ClonedForCondPreheader->phis()) { + if (isIncomingValueZeroOfPhi(&Phi)) { + TargetPHI = Φ + break; + } + } + + // Ensure we found the target PHI node + assert(TargetPHI && + "Failed to find target PHI node in ClonedForCondPreheader"); + + // Update the incoming value of the PHI nodes in ForCondPreheader to the + // result of the new sub instruction + for (PHINode &Phi : ForCondPreheader->phis()) { + if (Phi.getType()->isIntegerTy(32)) { + Phi.setIncomingValue(0, TargetPHI); + Phi.setIncomingBlock(0, ClonedForCondPreheader); + Phi.setIncomingValue(1, NewSubInst); + Phi.setIncomingBlock(1, LoopExit); + } else if (Phi.getType()->isFloatTy()) { + Phi.setIncomingValue(0, TargetPHI3); + Phi.setIncomingBlock(0, ClonedForCondPreheader); + // Phi.setIncomingValue(1, TargetInst); + Phi.setIncomingBlock(1, LoopExit); + } + } + + // Get the icmp instruction in ForCondPreheader + ICmpInst *icmpInst = getFirstICmpInst(ForCondPreheader); + + // Ensure we found the icmp instruction + assert(icmpInst && "Failed to find icmp instruction in ForCondPreheader"); + + // Set the operand 1 of icmpInst to constant 7 + LLVMContext &Ctx = ForCondPreheader->getContext(); + Value *const7 = ConstantInt::get(Type::getInt32Ty(Ctx), 7); + icmpInst->setOperand(1, const7); + + // Create a new add nsw instruction before icmpInst, with operand 0 the same + // as icmpInst, and operand 1 as -7. 
This instruction will be used as the + // return value of the function + Value *constNeg7 = ConstantInt::get(Type::getInt32Ty(Ctx), -7); + IRBuilder<> BuilderBeforeICmp(icmpInst); + Value *AddInst = + BuilderBeforeICmp.CreateNSWAdd(icmpInst->getOperand(0), constNeg7); + + ForBody->getTerminator()->setSuccessor(0, LoopExit); + + return AddInst; +} + +static void updateRealForBody(Function &F, Value *sub) { + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + assert(ForBody && "Expected to find for.body!"); + ICmpInst *lastICmp = + getLastICmpInstWithPredicate(ForBody, ICmpInst::ICMP_SLT); + if (lastICmp) { + lastICmp->setOperand(1, sub); + } +} + +static void modifyForBody(BasicBlock *ClonedForCondPreheader, + BasicBlock *ForBody) { + // Find the unique float type PHI node in ForBody + PHINode *FloatPhiInForBody = getFirstFloatPhi(ForBody); + assert(FloatPhiInForBody && "Failed to find float type PHI node in ForBody"); + // Find the first float type PHI node in ClonedForCondPreheader + PHINode *FirstFloatPhiInClonedForCondPreheader = + getFirstFloatPhi(ClonedForCondPreheader); + assert(FloatPhiInForBody && "Failed to find float type PHI node in ForBody"); + // Set the incoming value of the float type PHI node in ForBody to the float + // type PHI node in ClonedForCondPreheader + FloatPhiInForBody->setIncomingValue(0, FirstFloatPhiInClonedForCondPreheader); + + // Find the unique icmp eq instruction in ForBody + ICmpInst *IcmpEq = getFirstICmpInstWithPredicate(ForBody, ICmpInst::ICMP_EQ); + + // Ensure we found the icmp eq instruction + assert(IcmpEq && "Failed to find icmp eq instruction in ForBody"); + + // Get the original operand 1 + Value *OriginalOperand1 = IcmpEq->getOperand(1); + + // Ensure the original operand 1 is an instruction + if (Instruction *OriginalOperand1Inst = + dyn_cast(OriginalOperand1)) { + // Set operand 1 to the operand 0 of the original operand 1 instruction + IcmpEq->setOperand(1, OriginalOperand1Inst->getOperand(0)); + } else { + assert(false && "The original operand 1 is not an instruction, " + "cannot get its operand 0\n"); + } + + // Find the phi i32 incoming value that is a variable in + // ClonedForCondPreheader + PHINode *TargetPHI = nullptr; + PHINode *TargetPHI2 = nullptr; + for (Instruction &I : *ClonedForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(Phi)) { + TargetPHI = Phi; + } else { + TargetPHI2 = Phi; + } + if (TargetPHI && TargetPHI2) + break; + } + } + + // Ensure we found the target PHI node + assert(TargetPHI && + "Failed to find the target PHI node in ClonedForCondPreheader"); + + // Find the phi i32 incoming value that is a variable in ForBody + PHINode *TargetPHIInForBody = nullptr; + PHINode *TargetPHIInForBody2 = nullptr; + for (Instruction &I : *ForBody) { + if (PHINode *Phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(Phi)) { + TargetPHIInForBody = Phi; + } else { + TargetPHIInForBody2 = Phi; + } + if (TargetPHIInForBody && TargetPHIInForBody2) + break; + } + } + + // Ensure that the target PHI nodes are found + assert(TargetPHIInForBody && TargetPHIInForBody2 && + "Failed to find matching PHI nodes in ForBody"); + + // Set the incoming value of the PHI nodes found in ForBody + // to the PHI nodes found in ClonedForCondPreheader + TargetPHIInForBody->setIncomingValue(0, TargetPHI); + TargetPHIInForBody2->setIncomingValue(0, TargetPHI2); + + IcmpEq->setOperand(0, TargetPHIInForBody2->getIncomingValue(1)); +} + +static void insertUnusedInstructionsBeforeIcmp(PHINode 
*phiI32InClonedForBody, + ICmpInst *lastIcmpEq) { + for (Use &U : phiI32InClonedForBody->uses()) { + if (Instruction *Used = dyn_cast(U.getUser())) { + if (Used->getParent() == nullptr) { + if (Used->use_empty()) { + Used->insertBefore(lastIcmpEq); + } + } + } + } +} + +static void modifyClonedForBody(BasicBlock *ClonedForBody) { + + ICmpInst *lastIcmpEq = getLastICmpInst(ClonedForBody); + assert(lastIcmpEq && + "Failed to find last icmp eq instruction in ClonedForBody"); + + PHINode *phiI32InClonedForBody = nullptr; + for (auto &Inst : *ClonedForBody) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (isIncomingValueZeroOfPhi(Phi)) { + phiI32InClonedForBody = Phi; + insertUnusedInstructionsBeforeIcmp(phiI32InClonedForBody, lastIcmpEq); + } + } + } + + // Ensure that the phi i32 node is found + assert(phiI32InClonedForBody && "phi i32 node not found in ClonedForBody"); +} + +static BasicBlock *getFirstSuccessorOfForBody(BasicBlock *ForBody) { + BasicBlock *ForCondPreheader = nullptr; + assert(succ_size(ForBody) == 2 && "ForBody should have 2 successors"); + for (auto *succ : successors(ForBody)) { + ForCondPreheader = succ; + break; + } + return ForCondPreheader; +} + +static std::tuple +cloneThreeBB(BasicBlock *ForBodyPreheader, BasicBlock *ForBody, + BasicBlock *ForCondPreheader, Function &F) { + ValueToValueMapTy VMap; + SmallVector NewBlocks; + + BasicBlock *ClonedForBodyPreheader = + CloneBasicBlock(ForBodyPreheader, VMap, ".modify", &F); + BasicBlock *ClonedForBody = CloneBasicBlock(ForBody, VMap, ".modify", &F); + BasicBlock *ClonedForCondPreheader = + CloneBasicBlock(ForCondPreheader, VMap, ".modify", &F); + + VMap[ForBodyPreheader] = ClonedForBodyPreheader; + VMap[ForBody] = ClonedForBody; + VMap[ForCondPreheader] = ClonedForCondPreheader; + + // Remap instructions and PHI nodes in the new loop + remapInstructionsInBlocks( + {ClonedForBodyPreheader, ClonedForBody, ClonedForCondPreheader}, VMap); + return std::make_tuple(ClonedForBodyPreheader, ClonedForBody, + ClonedForCondPreheader); +} + +static std::tuple +modifyFirstForBody(Loop *L, Function &F, BasicBlock *ForBody, Value *sub) { + + BasicBlock *ForBodyPreheader = L->getLoopPreheader(); + + // Find the predecessor of ForBodyPreheader + BasicBlock *PreForBody = nullptr; + assert(pred_size(ForBodyPreheader) == 1 && + "ForBodyPreheader should have only one predecessor"); + for (auto *Pred : predecessors(ForBodyPreheader)) { + PreForBody = Pred; + } + + // Find the first successor of ForBody, it should have two + BasicBlock *ForCondPreheader = getFirstSuccessorOfForBody(ForBody); + + std::tuple ClonedBBs = + cloneThreeBB(ForBodyPreheader, ForBody, ForCondPreheader, F); + BasicBlock *ClonedForBodyPreheader = std::get<0>(ClonedBBs); + BasicBlock *ClonedForBody = std::get<1>(ClonedBBs); + BasicBlock *ClonedForCondPreheader = std::get<2>(ClonedBBs); + + /* insert 2 cloned blocks between PreForBody and ForBody */ + // for.body -> for.body12.lr.ph + PreForBody->getTerminator()->setSuccessor(0, ClonedForBodyPreheader); + ClonedForBodyPreheader->moveAfter(PreForBody); + // for.body12.lr.ph -> for.body12 + ClonedForBodyPreheader->getTerminator()->setSuccessor(0, ClonedForBody); + + // for.body12 -> for.cond59.preheader + ClonedForBody->moveAfter(ClonedForBodyPreheader); + + // for.cond59.preheader -> for.body62.lr.ph + ClonedForCondPreheader->getTerminator()->setSuccessor(0, ForBodyPreheader); + + // for.cond59.preheader -> for.cond71.preheader + ClonedForCondPreheader->getTerminator()->setSuccessor(1, + ClonedForCondPreheader); + 
ClonedForCondPreheader->moveAfter(ClonedForBodyPreheader); + // for.body -> for.cond71.preheader + PreForBody->getTerminator()->setSuccessor(1, ClonedForCondPreheader); + + preProcessClonedForBody(ClonedForBody, sub); + updateRealForBody(F, sub); + unrollClonedForBody(ClonedForBody, ClonedForCondPreheader, 0); + modifyClonedForBody(ClonedForBody); + unrollClonedForCondPreheader(ClonedForBody, ClonedForCondPreheader, + ForCondPreheader); + + modifyForBody(ClonedForCondPreheader, ForBody); + std::tuple NewSubAndTargetPHI3 = + modifyForBodyPreheader(ForBodyPreheader, ClonedForCondPreheader); + + Value *AddInst = expandForCondPreheader( + ForBody, ForCondPreheader, ClonedForCondPreheader, NewSubAndTargetPHI3); + + ClonedForBodyPreheader->moveBefore(ClonedForBody); + groupAndReorderInstructions(ClonedForBody); + return std::make_tuple(ClonedForCondPreheader, ForCondPreheader, AddInst); +} + +static bool moveIfEndToEnd(Function &F) { + + BasicBlock &lastBB = F.back(); + if (lastBB.getName() == "if.end") { + return false; + } + + BasicBlock *ifEndBB = getBasicBlockByName(F, "if.end"); + assert(ifEndBB && "Expected to find if.end!"); + if (ifEndBB) { + ifEndBB->removeFromParent(); + ifEndBB->insertInto(&F); + } + return true; +} + +static Value *modifyForCondPreheader(Function &F) { + LLVMContext &Ctx = F.getContext(); + + BasicBlock *forCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *forBodyLrPh = getBasicBlockByName(F, "for.body.lr.ph"); + assert(forCondPreheader && "Expected to find for.cond.preheader!"); + assert(forBodyLrPh && "Expected to find for.body.lr.ph!"); + forCondPreheader->replaceAllUsesWith(forBodyLrPh); + forCondPreheader->eraseFromParent(); + forBodyLrPh->setName("for.cond.preheader"); + + unsigned int loadnum = 0; + for (auto I = forBodyLrPh->begin(); I != forBodyLrPh->end(); ++I) { + if (auto *loadinst = dyn_cast(&*I)) { + loadnum++; + if (loadnum == 2) { + IRBuilder<> Builder(loadinst->getNextNode()); + Value *NegSeven = ConstantInt::get(Type::getInt32Ty(Ctx), -7); + Value *Sub = Builder.CreateNSWAdd(loadinst, NegSeven, "sub"); + return Sub; // Return the newly inserted instruction + } + } + } + assert(false && "it must not be here"); +} + +static void modifyForCondPreheader2(BasicBlock *ClonedForBody, + BasicBlock *ClonedForCondPreheader, + BasicBlock *ForCondPreheader, + Value *andinst) { + + // Find phi instructions of float type in ClonedForBody + SmallVector PhiNodes; + for (Instruction &I : *ClonedForBody) { + if (PHINode *Phi = dyn_cast(&I)) { + PhiNodes.push_back(Phi); + } + } + + // Clone the found phi instructions to the beginning of ClonedForCondPreheader + // in order + Instruction *InsertPoint = &ForCondPreheader->front(); + PHINode *phi = cast(InsertPoint); + + BasicBlock *lastForCondPreheader = phi->getIncomingBlock(0); + SmallVector ClonedPhiNodes; + unsigned int floatphicount = 0; + for (PHINode *Phi : PhiNodes) { + PHINode *ClonedPhi = cast(Phi->clone()); + ClonedPhi->setName(Phi->getName() + ".clone"); + // Modify the operand 0 basicblock of each phi instruction to ForBody + if (Phi->getType()->isFloatTy()) { + if (floatphicount == 0) { + ClonedPhi->setIncomingValue(0, phi->getIncomingValue(0)); + floatphicount++; + } + } + ClonedPhi->setIncomingBlock(0, lastForCondPreheader); + ClonedPhi->insertAfter(InsertPoint); + // Update the insertion point to after the newly inserted PHI node + InsertPoint = ClonedPhi; + + ClonedPhiNodes.push_back(ClonedPhi); + } + + // Find operand 1 of the icmp instruction from ClonedForBody + ICmpInst 
*firstIcmp = getFirstICmpInst(ClonedForBody); + assert(firstIcmp && "Unable to find icmp instruction in ClonedForBody"); + Value *IcmpOperand1 = firstIcmp->getOperand(1); + + // Set operand 0 of icmp in ForCondPreheader to ClonedPhiNodes[0], and operand + // 1 to IcmpOperand1 + for (Instruction &I : *ForCondPreheader) { + if (ICmpInst *Icmp = dyn_cast(&I)) { + Icmp->setOperand(0, ClonedPhiNodes[0]); + Icmp->setOperand(1, IcmpOperand1); + Icmp->setName("cmp"); + break; + } + } + + ForCondPreheader->getTerminator()->setSuccessor(1, ClonedForCondPreheader); + + // // Delete redundant getelementptr, store and add instructions + SmallVector InstructionsToRemove; + for (Instruction &I : *ForCondPreheader) { + if (isa(&I) || isa(&I) || + isa(&I)) { + InstructionsToRemove.push_back(&I); + } + } + for (auto Inst = InstructionsToRemove.rbegin(); + Inst != InstructionsToRemove.rend(); ++Inst) { + if ((*Inst)->use_empty()) { + (*Inst)->eraseFromParent(); + } + } + // Find the icmp instruction in ClonedForCondPreheader + ICmpInst *IcmpInForCondPreheader = + getFirstICmpInstWithPredicate(ForCondPreheader, ICmpInst::ICMP_EQ); + + // Ensure that the icmp instruction is found + assert(IcmpInForCondPreheader && + "icmp instruction not found in ClonedForCondPreheader"); + + // Get the original operand 1 + Value *OriginalOperand1 = IcmpInForCondPreheader->getOperand(1); + + // If the original operand 1 is an instruction, get its operand 0 + if (Instruction *OriginalOperand1Inst = + dyn_cast(OriginalOperand1)) { + Value *NewOperand1 = OriginalOperand1Inst->getOperand(0); + + // Set the new operand 1 + IcmpInForCondPreheader->setOperand(1, NewOperand1); + // Change the original eq to slt + + IcmpInForCondPreheader->setPredicate(CmpInst::ICMP_SLT); + + } else { + assert(false && "The original operand 1 is not an instruction, cannot get " + "its operand 0\n"); + } + + // Find phi i32 node in ForCondPreheader with incoming 0 value == 0 + PHINode *TargetPhi = nullptr; + for (Instruction &I : *ForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (isIncomingValueZeroOfPhi(Phi)) { + TargetPhi = Phi; + break; + } + } + } + + // Ensure the target phi node is found + assert(TargetPhi && "No matching phi i32 node found in ForCondPreheader"); + + TargetPhi->setIncomingValue(1, andinst); +} + +static Value *modifyClonedForBodyPreheader(BasicBlock *ClonedForBodyPreheader, + BasicBlock *ForBody) { + ICmpInst *firstIcmp = getFirstICmpInst(ForBody); + assert(firstIcmp && "Unable to find icmp instruction in ForBody"); + + Value *IcmpOperand1 = firstIcmp->getOperand(1); + + IRBuilder<> Builder(ClonedForBodyPreheader->getTerminator()); + Value *AndInst = + Builder.CreateAnd(IcmpOperand1, Builder.getInt32(2147483640)); + return AndInst; +} + +static void modifyClonedForCondPreheader(BasicBlock *ClonedForCondPreheader, + BasicBlock *ForBody, + BasicBlock *ForCondPreheader) { + + // Find float type phi node in ForBody + PHINode *FloatPhiInForBody = nullptr; + for (Instruction &I : *ForBody) { + if (PHINode *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + FloatPhiInForBody = cast(I.clone()); + break; + } + } + } + + // Find and replace float type phi node in ClonedForCondPreheader + if (FloatPhiInForBody) { + PHINode *phi = getFirstFloatPhi(ClonedForCondPreheader); + assert(phi && "phi node not found"); + FloatPhiInForBody->insertBefore(phi); + phi->replaceAllUsesWith(FloatPhiInForBody); + phi->eraseFromParent(); + } + + // Set incomingblock 0 of FloatPhiInForBody to ForCondPreheader + if (FloatPhiInForBody) { + 
FloatPhiInForBody->setIncomingBlock(0, ForCondPreheader); + } + + // Find float type phi nodes in ForCondPreheader + SmallVector FloatPhisInForCondPreheader; + for (Instruction &I : *ForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + FloatPhisInForCondPreheader.push_back(Phi); + } + } + } + + // Create 7 fadd instructions + Value *LastFAdd = nullptr; + if (FloatPhisInForCondPreheader.size() >= 8) { + IRBuilder<> Builder(FloatPhiInForBody->getNextNode()); + + Value *PrevAdd = getFirstFloatPhi(ClonedForCondPreheader); + + assert(PrevAdd && + "Unable to find float type PHI node in ClonedForCondPreheader"); + Value *Add139 = + Builder.CreateFAdd(PrevAdd, FloatPhisInForCondPreheader[2], "add139"); + Value *Add140 = + Builder.CreateFAdd(FloatPhisInForCondPreheader[3], + FloatPhisInForCondPreheader[4], "add140"); + Value *Add141 = + Builder.CreateFAdd(FloatPhisInForCondPreheader[5], + FloatPhisInForCondPreheader[6], "add141"); + Value *Add142 = + Builder.CreateFAdd(FloatPhisInForCondPreheader[7], + FloatPhisInForCondPreheader[8], "add142"); + Value *Add143 = Builder.CreateFAdd(Add139, Add140, "add143"); + Value *Add144 = Builder.CreateFAdd(Add141, Add142, "add144"); + Value *Add145 = Builder.CreateFAdd(Add143, Add144, "add145"); + LastFAdd = Add145; + } else { + llvm_unreachable("Unable to find float type PHI node in ForCondPreheader"); + } + + // Find store instruction in ForCondPreheader and update its operand + if (LastFAdd) { + for (auto &Inst : *ClonedForCondPreheader) { + if (auto *si = dyn_cast(&Inst)) { + si->setOperand(0, LastFAdd); + break; + } + } + } + + Value *addinst = nullptr; + // Iterate through instructions in ClonedForCondPreheader, looking for addnuw + // instruction + for (auto &Inst : *ClonedForCondPreheader) { + if (auto *AddInst = dyn_cast(&Inst)) { + if (AddInst->getOpcode() == Instruction::Add && + AddInst->hasNoUnsignedWrap()) { + addinst = AddInst; + break; + } + } + } + // Get the second successor of ClonedForCondPreheader + BasicBlock *SecondSuccessor = nullptr; + int SuccCount = 0; + for (auto *Succ : successors(ClonedForCondPreheader)) { + if (SuccCount == 1) { + SecondSuccessor = Succ; + break; + } + SuccCount++; + } + + if (SecondSuccessor && addinst) { + // Iterate through all PHI nodes in SecondSuccessor + int phiCount = 0; + for (PHINode &Phi : SecondSuccessor->phis()) { + if (phiCount == 1) { // Second phi node + // Set the second predecessor to ClonedForCondPreheader and its value to + // addinst + Phi.setIncomingBlock(1, ClonedForCondPreheader); + Phi.setIncomingValue(1, addinst); + } else { + // For other phi nodes, only update the predecessor basic block + Phi.setIncomingBlock(1, ClonedForCondPreheader); + } + phiCount++; + } + } +} + +static void modifyClonedForBody2(BasicBlock *ClonedForBody, + BasicBlock *ClonedForCondPreheader, + Value *AddInst, BasicBlock *ForCondPreheader) { + SmallVector floatPhiNodes; + + // Iterate through all instructions in ClonedForCondPreheader + for (Instruction &I : *ClonedForCondPreheader) { + if (PHINode *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + floatPhiNodes.push_back(Phi); + if (floatPhiNodes.size() == 8) { + break; // Stop after finding 8 float type PHI nodes + } + } + } + } + + // Ensure we found 8 float type PHI nodes + assert(floatPhiNodes.size() == 8 && + "Unable to find 8 float type PHI nodes in ClonedForCondPreheader"); + + // Now floatPhiNodes contains 8 float type PHI nodes in order + + // Iterate through all PHI nodes in ClonedForBody + int 
phiIndex = 0; + for (PHINode &Phi : ClonedForBody->phis()) { + if (Phi.getType()->isFloatTy()) { + // Ensure we don't access floatPhiNodes out of bounds + if (phiIndex < floatPhiNodes.size()) { + // Set the 0th incoming value of the PHI node to the corresponding node + // in floatPhiNodes + if (phiIndex > + 0) { // Don't set the first phi node, as it's floatPhiInForBody + Phi.setIncomingValue(0, floatPhiNodes[phiIndex]); + } + phiIndex++; + } else { + // If the number of float type PHI nodes in ClonedForBody exceeds the + // size of floatPhiNodes, output a warning + assert(false && "Warning: Number of float type PHI nodes in " + "ClonedForBody exceeds expectations\n"); + break; + } + } + } + + // Ensure we processed all expected PHI nodes + if (phiIndex < floatPhiNodes.size()) { + assert(false && "Warning: Number of float type PHI nodes in ClonedForBody " + "is less than expected\n"); + } + + // Find the last icmp eq instruction in ClonedForBody + ICmpInst *lastIcmpEq = + getLastICmpInstWithPredicate(ClonedForBody, ICmpInst::ICMP_EQ); + + // Ensure we found the icmp eq instruction + assert(lastIcmpEq && "Unable to find icmp eq instruction in ClonedForBody"); + + // Set operand 1 to addInst + lastIcmpEq->setOperand(1, AddInst); + // Change the predicate of the icmp eq instruction to slt (signed less than) + lastIcmpEq->setPredicate(ICmpInst::ICMP_SLT); + // Change the name to cmp + lastIcmpEq->setName("cmp"); + + ClonedForBody->getTerminator()->setSuccessor(1, ForCondPreheader); + + // Find phi i32 node in ClonedForBody + PHINode *phiI32InClonedForBody = nullptr; + for (auto &Inst : *ClonedForBody) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (Phi->getType()->isIntegerTy(32)) { + phiI32InClonedForBody = Phi; + insertUnusedInstructionsBeforeIcmp(phiI32InClonedForBody, lastIcmpEq); + } + } + } + + // Ensure we found the phi i32 node + assert(phiI32InClonedForBody && + "Unable to find phi i32 node in ClonedForBody"); +} + +static std::pair findTwoI32PhiInBB(BasicBlock *ForBody) { + // Find the first i32 type PHI instruction in ForBody + PHINode *firstI32PhiInBB = nullptr; + PHINode *secondI32PhiInBB = nullptr; + int i32PhiCount2 = 0; + for (auto &Inst : *ForBody) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (Phi->getType()->isIntegerTy(32)) { + if (i32PhiCount2 == 0) { + firstI32PhiInBB = Phi; + i32PhiCount2++; + } else if (i32PhiCount2 == 1) { + secondI32PhiInBB = Phi; + break; + } + } + } + } + + // Ensure we found two i32 type PHI instructions in ForBody + assert(firstI32PhiInBB && secondI32PhiInBB && + "Unable to find two i32 type PHI instructions in BB"); + + return std::make_pair(firstI32PhiInBB, secondI32PhiInBB); +} +static void modifyForBody2(BasicBlock *ClonedForCondPreheader, + BasicBlock *ForBody, BasicBlock *ForCondPreheader) { + // Find the first i32 type PHI instruction in ForCondPreheader + auto [firstI32PhiInForCondPreheader, secondI32PhiInForCondPreheader] = + findTwoI32PhiInBB(ForCondPreheader); + + // Find the first i32 type PHI instruction in ForBody + auto [firstI32PhiInForBody, secondI32PhiInForBody] = + findTwoI32PhiInBB(ForBody); + + // Set the incoming 0 value of the two i32 type PHI instructions found in + // ForBody to the firstI32Phi found in ForCondPreheader + firstI32PhiInForBody->setIncomingValue(0, firstI32PhiInForCondPreheader); + secondI32PhiInForBody->setIncomingValue(0, secondI32PhiInForCondPreheader); + + ForBody->getTerminator()->setSuccessor(0, ClonedForCondPreheader); + + // Find the first float type PHI instruction in ForCondPreheader + PHINode 
*SecondFloatPhiInForCondPreheader = nullptr; + int floatPhiCount = 0; + for (auto &Inst : *ForCondPreheader) { + if (PHINode *Phi = dyn_cast(&Inst)) { + if (Phi->getType()->isFloatTy()) { + floatPhiCount++; + if (floatPhiCount == 2) { + SecondFloatPhiInForCondPreheader = Phi; + break; + } + } + } + } + + // Ensure we found a float type PHI instruction in ForCondPreheader + assert(SecondFloatPhiInForCondPreheader && + "Unable to find float type PHI instruction in ForCondPreheader"); + + // Find the only float type PHI instruction in ForBody + PHINode *FloatPhiInForBody = getFirstFloatPhi(ForBody); + assert(FloatPhiInForBody && "Unable to find float type PHI instruction in " + "ForBody"); + + // Set incoming value 0 of the float type PHI instruction in ForBody + FloatPhiInForBody->setIncomingValue(0, SecondFloatPhiInForCondPreheader); + + // Find the unique float type PHI instruction in ClonedForCondPreheader + PHINode *FloatPhiInClonedForCondPreheader = + getFirstFloatPhi(ClonedForCondPreheader); + assert(FloatPhiInClonedForCondPreheader && + "Float type PHI instruction not found in ClonedForCondPreheader"); + + // Set incoming value 0 of the float type PHI instruction in + // ClonedForCondPreheader + FloatPhiInClonedForCondPreheader->setIncomingValue( + 0, SecondFloatPhiInForCondPreheader); +} + +// Helper function to run dead code elimination +static void runDeadCodeElimination(Function &F) { + legacy::FunctionPassManager FPM(F.getParent()); + FPM.add(createDeadCodeEliminationPass()); + FPM.run(F); + LLVM_DEBUG(F.dump()); +} + +static bool modifySecondForBody(Loop *L, Function &F, BasicBlock *ForBody, + BasicBlock *FirstClonedForCondPreheader, + BasicBlock *FirstForCondPreheader, + Value *AddInst) { + BasicBlock *ForBodyPreheader = L->getLoopPreheader(); + + // Find the 0th successor of ForBody, it should have two + BasicBlock *ForCondPreheader = getFirstSuccessorOfForBody(ForBody); + + std::tuple ClonedBBs = + cloneThreeBB(ForBodyPreheader, ForBody, ForCondPreheader, F); + BasicBlock *ClonedForBodyPreheader = std::get<0>(ClonedBBs); + BasicBlock *ClonedForBody = std::get<1>(ClonedBBs); + BasicBlock *ClonedForCondPreheader = std::get<2>(ClonedBBs); + + ClonedForCondPreheader->setName("for.end"); + ClonedForBody->moveBefore(ForBody); + ClonedForBodyPreheader->moveBefore(ClonedForBody); + ForCondPreheader->moveBefore(ClonedForBodyPreheader); + ClonedForCondPreheader->moveAfter(ForBody); + ForCondPreheader->getTerminator()->setSuccessor(0, ForBodyPreheader); + + unrollClonedForBody(ClonedForBody, ClonedForCondPreheader, 1); + modifyClonedForBody2(ClonedForBody, FirstClonedForCondPreheader, AddInst, + ForCondPreheader); + + Value *andinst = + modifyClonedForBodyPreheader(ClonedForBodyPreheader, ForBody); + modifyForCondPreheader2(ClonedForBody, ClonedForCondPreheader, + ForCondPreheader, andinst); + modifyClonedForCondPreheader(ClonedForCondPreheader, ForBody, + ForCondPreheader); + modifyForBody2(ClonedForCondPreheader, ForBody, ForCondPreheader); + + FirstForCondPreheader->getTerminator()->setSuccessor(0, + ClonedForBodyPreheader); + + // Run Dead Code Elimination optimization + runDeadCodeElimination(F); + + groupAndReorderInstructions(ClonedForBody); + + return true; +} +static void insertDoublePreheader(Function &F) { + BasicBlock *entry = &F.getEntryBlock(); + BasicBlock *ifend = &F.back(); + BasicBlock *entry_successor1 = entry->getTerminator()->getSuccessor(1); + + // Create a new basic block + BasicBlock *newBB = BasicBlock::Create( + F.getContext(), entry_successor1->getName() + 
".preheader", &F, + entry_successor1); + + Value *len = getLenFromEntryBlock(F); + + // Insert instructions in the new basic block + IRBuilder<> builder(newBB); + Value *cmp151349 = builder.CreateICmpSGT( + len, ConstantInt::get(len->getType(), 0), "cmp151349"); + + // Create a conditional branch + builder.CreateCondBr(cmp151349, entry_successor1, ifend); + + // Modify the terminator of entry to jump to the new basic block + entry->getTerminator()->setSuccessor(1, newBB); +} +static bool unrollFir(Function &F, Loop *L) { + + bool Changed = false; + static BasicBlock *FirstClonedForCondPreheader = nullptr; + static BasicBlock *FirstForCondPreheader = nullptr; + static Value *AddInst = nullptr; + + for (auto *BB : L->blocks()) { + + assert(BB->getName().contains("for.body") && "BB must is for.body"); + Changed = moveIfEndToEnd(F); + // Temporarily skip processing the second loop + + if (Changed) { + insertDoublePreheader(F); + Value *sub = modifyForCondPreheader(F); + std::tuple result = + modifyFirstForBody(L, F, BB, sub); + FirstClonedForCondPreheader = std::get<0>(result); + FirstForCondPreheader = std::get<1>(result); + AddInst = std::get<2>(result); + } else { + modifySecondForBody(L, F, BB, FirstClonedForCondPreheader, + FirstForCondPreheader, AddInst); + } + } + LLVM_DEBUG(F.dump()); + + return Changed; +} + +// Preprocessing function +static PHINode *preprocessClonedForBody(BasicBlock *ClonedForBody) { + // Find the unique PHI node + PHINode *phiNode = nullptr; + for (auto &I : *ClonedForBody) { + if (auto *phi = dyn_cast(&I)) { + phiNode = phi; + break; + } + } + + // Ensure that the PHI node is found + assert(phiNode && "PHI node not found"); + + // Find two mul nsw instructions + SmallVector mulInsts; + for (auto &I : *ClonedForBody) { + if (auto *binOp = dyn_cast(&I)) { + if (binOp->getOpcode() == Instruction::Mul && binOp->hasNoSignedWrap()) { + mulInsts.push_back(binOp); + } + } + } + + // Replace mul nsw instructions with the PHI node + for (auto *mulInst : mulInsts) { + mulInst->replaceAllUsesWith(phiNode); + mulInst->eraseFromParent(); + } + return phiNode; +} + +static Instruction *modifyAddToOrInClonedForBody(BasicBlock *ClonedForBody) { + // Find the unique add nuw nsw instruction + Instruction *addInst = nullptr; + for (auto &I : *ClonedForBody) { + if (auto *binOp = dyn_cast(&I)) { + if (binOp->getOpcode() == Instruction::Add && + binOp->hasNoUnsignedWrap()) { + addInst = binOp; + break; + } + } + } + + // Ensure that the add nuw nsw instruction is found + assert(addInst && "add nuw nsw instruction not found"); + + // Create a new or disjoint instruction + Instruction *orInst = BinaryOperator::CreateDisjoint( + Instruction::Or, addInst->getOperand(0), + ConstantInt::get(addInst->getType(), 1), "add", addInst); + + // Replace all uses of the add instruction + addInst->replaceAllUsesWith(orInst); + + // Delete the original add instruction + addInst->eraseFromParent(); + orInst->setName("add"); + return orInst; +} + +static void modifyAddToOr(BasicBlock *ClonedForBody) { + SmallVector addInsts; + + // Collect all add instructions that meet the criteria + for (auto &I : *ClonedForBody) { + if (auto *binOp = dyn_cast(&I)) { + if (binOp->getOpcode() == Instruction::Add) { + addInsts.push_back(binOp); + } + } + } + if (addInsts.empty()) { + return; + } + // Replace each add instruction with an or disjoint instruction + for (auto it = addInsts.begin(); it != std::prev(addInsts.end()); ++it) { + auto *addInst = *it; + // Create a new or disjoint instruction + Instruction *orInst 
= + BinaryOperator::CreateDisjoint(Instruction::Or, addInst->getOperand(0), + addInst->getOperand(1), "add", addInst); + + // Replace all uses of the add instruction + addInst->replaceAllUsesWith(orInst); + + // Delete the original add instruction + addInst->eraseFromParent(); + orInst->setName("add"); + } +} + +static Value *unrolladdcClonedForBody(BasicBlock *ClonedForBody, + int unroll_factor) { + + // Call the preprocessing function + PHINode *phiNode = preprocessClonedForBody(ClonedForBody); + + // Replace add instructions with or instructions + Instruction *orInst = modifyAddToOrInClonedForBody(ClonedForBody); + + // Find the first non-PHI instruction and or instruction + Instruction *firstNonPHI = ClonedForBody->getFirstNonPHI(); + + // Ensure that the start and end instructions are found + assert(firstNonPHI && orInst && "Start or end instruction not found"); + + // Find the icmp instruction + Instruction *icmpInst = getFirstICmpInst(ClonedForBody); + + // Ensure that the icmp instruction is found + assert(icmpInst && "icmp instruction not found"); + + // Print information about the icmp instruction + + Instruction *newOrInst = orInst; + // Copy instructions 15 times + for (int i = 1; i <= (unroll_factor - 1); i++) { + ValueToValueMapTy VMap; + for (auto it = firstNonPHI->getIterator(); &*it != orInst; ++it) { + Instruction *newInst = it->clone(); + // For getelementptr instructions, set the second operand to orInst + if (GetElementPtrInst *GEP = dyn_cast(newInst)) { + newInst->setOperand(1, newOrInst); + newInst->setName("arrayidx"); + } + // If it's a fadd instruction, change its name to add + if (newInst->getOpcode() == Instruction::FAdd) { + newInst->setName("add"); + } + VMap[&*it] = newInst; + newInst->insertBefore(icmpInst); + } + + // Update operands of new instructions + for (auto it = firstNonPHI->getIterator(); &*it != orInst; ++it) { + Instruction *newInst = cast(VMap[&*it]); + for (unsigned j = 0; j < newInst->getNumOperands(); j++) { + Value *op = newInst->getOperand(j); + if (VMap.count(op)) { + newInst->setOperand(j, VMap[op]); + } + } + } + // Clone orInst and insert before icmpInst + newOrInst = orInst->clone(); + // Set the second operand of newOrInst to i+1 + newOrInst->setOperand(1, ConstantInt::get(newOrInst->getType(), i + 1)); + newOrInst->setName("add"); + newOrInst->insertBefore(icmpInst); + VMap[orInst] = newOrInst; + } + + // Replace or instruction with add nuw nsw instruction + IRBuilder<> Builder(newOrInst); + Value *newAddInst = + Builder.CreateNUWAdd(newOrInst->getOperand(0), newOrInst->getOperand(1)); + newOrInst->replaceAllUsesWith(newAddInst); + newOrInst->eraseFromParent(); + + // Create a new add instruction, subtracting 16 from len + Builder.SetInsertPoint(icmpInst); + Value *len = icmpInst->getOperand(1); + Value *sub = Builder.CreateNSWAdd( + len, ConstantInt::get(len->getType(), -unroll_factor), "sub"); + // Set the icmp instruction's predicate to sgt, and operands to newAddInst + if (ICmpInst *icmp = dyn_cast(icmpInst)) { + icmp->setPredicate(ICmpInst::ICMP_SGT); + icmp->setOperand(0, newAddInst); + icmp->setOperand(1, sub); + } + + phiNode->setIncomingValue(0, newAddInst); + return sub; +} + +static void expandForCondPreheaderaddc(Function &F, + BasicBlock *ForCondPreheader, + BasicBlock *ClonedForBody, + BasicBlock *ForBody, Value *sub, + int unroll_factor) { + // Create a new ForCondPreheader after the original ForCondPreheader + BasicBlock *NewForCondPreheader = BasicBlock::Create( + ForCondPreheader->getContext(), 
"for.cond.preheader.new", + ForCondPreheader->getParent(), ForCondPreheader->getNextNode()); + // Create a new empty BasicBlock after NewForCondPreheader + BasicBlock *NewForCondPreheader2 = BasicBlock::Create( + NewForCondPreheader->getContext(), "for.cond.preheader.new2", + NewForCondPreheader->getParent(), NewForCondPreheader->getNextNode()); + + // Move sub to the new ForCondPreheader + if (Instruction *SubInst = dyn_cast(sub)) { + SubInst->removeFromParent(); + SubInst->insertInto(NewForCondPreheader, NewForCondPreheader->begin()); + } + + // Create new comparison instruction in NewForCondPreheader + IRBuilder<> Builder(NewForCondPreheader); + Value *len = getLenFromEntryBlock(F); + + assert(len && "Parameter named 'len' not found"); + + Value *cmp6not207 = Builder.CreateICmpULT( + len, ConstantInt::get(len->getType(), unroll_factor), "cmp6.not207"); + + // Create conditional branch instruction + Builder.CreateCondBr(cmp6not207, NewForCondPreheader2, ClonedForBody); + + // Find if.end basic block + BasicBlock *ifEndBB = getBasicBlockByName(F, "if.end"); + BasicBlock *returnBB = getBasicBlockByName(F, "return"); + assert(ifEndBB && "Expected to find if.end!"); + assert(returnBB && "Expected to find return!"); + // Get the terminator instruction of if.end + Instruction *terminator = ifEndBB->getTerminator(); + if (!terminator) { + assert(false && "if.end basic block has no terminator instruction\n"); + return; + } + + // Replace the first operand of the terminator instruction with + // NewForCondPreheader + terminator->setOperand(2, NewForCondPreheader); + + // Find the unique PHINode in clonedForBody + PHINode *uniquePHI = nullptr; + for (Instruction &I : *ClonedForBody) { + if (auto *phi = dyn_cast(&I)) { + if (uniquePHI) { + // If we've already found a PHINode but find another, it's not unique + + uniquePHI = nullptr; + break; + } + uniquePHI = phi; + } + } + + assert(uniquePHI && "No unique PHINode found in ForBody\n"); + + uniquePHI->setIncomingBlock(1, NewForCondPreheader); + auto *clonedphi = uniquePHI->clone(); + clonedphi->insertInto(NewForCondPreheader2, NewForCondPreheader2->begin()); + + // Create comparison instruction + ICmpInst *cmp85209 = + new ICmpInst(ICmpInst::ICMP_SLT, clonedphi, len, "cmp85209"); + cmp85209->insertAfter(clonedphi); + + // Create conditional branch instruction + BranchInst *br = BranchInst::Create(ForBody, returnBB, cmp85209); + + br->insertAfter(cmp85209); + + // Get the terminator instruction of ClonedForBody + BranchInst *clonedTerminator = + dyn_cast(ClonedForBody->getTerminator()); + assert(clonedTerminator && + "ClonedForBody's terminator should be a BranchInst"); + if (!clonedTerminator) { + assert(false && "ClonedForBody has no terminator instruction\n"); + return; + } + + // Set the first operand of ClonedForBody's terminator to NewForCondPreheader2 + clonedTerminator->setOperand(2, NewForCondPreheader2); + + // Find the unique PHI node in ForBody + PHINode *uniquePHI2 = nullptr; + for (Instruction &I : *ForBody) { + if (auto *phi = dyn_cast(&I)) { + if (uniquePHI2) { + // If we've already found a PHINode but find another, it's not unique + + uniquePHI = nullptr; + break; + } + uniquePHI2 = phi; + } + } + + assert(uniquePHI2 && "No unique PHINode found in ForBody\n"); + + uniquePHI2->setIncomingValue(1, clonedphi); + uniquePHI2->setIncomingBlock(1, NewForCondPreheader2); + + // Find the unique PHI node in returnBB + PHINode *returnBBPHI = nullptr; + for (Instruction &I : *returnBB) { + if (auto *phi = dyn_cast(&I)) { + if (returnBBPHI) { + 
// If we've already found a PHINode but find another, it's not unique + returnBBPHI = nullptr; + break; + } + returnBBPHI = phi; + } + } + + if (returnBBPHI) { + // Add [0, NewForCondPreheader2] + returnBBPHI->addIncoming(ConstantInt::get(returnBBPHI->getType(), 0), + NewForCondPreheader2); + } else { + assert(false && "No unique PHI node found in returnBB\n"); + } +} + +static void addnoalias(Function &F) { + for (Argument &Arg : F.args()) { + if (Arg.getType()->isPointerTy()) { + Arg.addAttr(Attribute::NoAlias); + } + } +} +static BasicBlock *cloneForBody(Function &F, BasicBlock *ForBody, + const std::string &Suffix) { + ValueToValueMapTy VMap; + BasicBlock *ClonedForBody = CloneBasicBlock(ForBody, VMap, Suffix, &F); + VMap[ForBody] = ClonedForBody; + remapInstructionsInBlocks({ClonedForBody}, VMap); + return ClonedForBody; +} + +static void unrollAddc(Function &F, ScalarEvolution &SE, Loop *L, + int unroll_factor) { + + // Get the basic block containing the function body from L + BasicBlock *ForBody = L->getHeader(); + + // Ensure that the basic block containing the function body is found + if (!ForBody) { + assert(ForBody && "ForBody not found"); + return; + } + + // clone for body + + BasicBlock *ClonedForBody = cloneForBody(F, ForBody, ".modify"); + ClonedForBody->moveBefore(ForBody); + + Value *sub = unrolladdcClonedForBody(ClonedForBody, unroll_factor); + + // Find the ForCondPreheader basic block from F + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + assert(ForCondPreheader && "Expected to find for.cond.preheader!"); + expandForCondPreheaderaddc(F, ForCondPreheader, ClonedForBody, ForBody, sub, + unroll_factor); + modifyAddToOr(ClonedForBody); + groupAndReorderInstructions(ClonedForBody); + + // Verify the function + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(errs() << "Function verification failed\n"); + return; + } +} + +static void unrollCorr(Function &F, Loop *L, int unroll_factor) { + + // Get the basic block containing the function body from L + BasicBlock *ForBody = L->getHeader(); + assert(ForBody && "ForBody not found"); + + // clone for body + BasicBlock *ClonedForBody = cloneForBody(F, ForBody, ".unroll"); + + BasicBlock *returnBB = getBasicBlockByName(F, "return"); + assert(returnBB && "Expected to find return!"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + assert(ForCondPreheader && "Expected to find for.cond.preheader!"); + BasicBlock *ForCond11PreheaderUs = L->getLoopPreheader(); + assert(ForCond11PreheaderUs && "Expected to find for.cond.preheader!"); + + ClonedForBody->moveBefore(returnBB); + + ForCondPreheader->setName("if.end"); + + // Find the first instruction in ForCondPreheader + Instruction *FirstInst = &*ForCondPreheader->begin(); + Instruction *SecondInst = FirstInst->getNextNode(); + // Ensure the first instruction is a sub nsw instruction + if (BinaryOperator *SubInst = dyn_cast(FirstInst)) { + if (SubInst->getOpcode() == Instruction::Sub && + SubInst->hasNoSignedWrap()) { + ; + } else { + assert(false && "The first instruction in ForCondPreheader is not a sub " + "nsw instruction\n"); + } + } else { + assert(false && "The first instruction in ForCondPreheader is not a binary " + "operation\n"); + } + // Insert new instruction after FirstInst + IRBuilder<> Builder(FirstInst->getNextNode()); + Value *Sub6 = Builder.CreateNSWAdd( + FirstInst, ConstantInt::get(FirstInst->getType(), 1 - unroll_factor), + "sub6"); + + if (ICmpInst *CmpInst = dyn_cast(SecondInst)) { + if 
(CmpInst->getPredicate() == ICmpInst::ICMP_EQ) { + CmpInst->setOperand(0, FirstInst); + CmpInst->setOperand( + 1, ConstantInt::get(FirstInst->getType(), unroll_factor - 1)); + CmpInst->setPredicate(ICmpInst::ICMP_SGT); + } + } + // Create new basic blocks + BasicBlock *ForCond11PreheaderPreheader = ForCondPreheader->getNextNode(); + BasicBlock *ForCond8PreheaderLrPh = + BasicBlock::Create(F.getContext(), "for.cond8.preheader.lr.ph", &F, + ForCond11PreheaderPreheader); + BasicBlock *ForCond8Preheader = BasicBlock::Create( + F.getContext(), "for.cond8.preheader", &F, ForCond11PreheaderPreheader); + BasicBlock *ForBody10LrPh = BasicBlock::Create( + F.getContext(), "for.body10.lr.ph", &F, ForCond11PreheaderPreheader); + BasicBlock *ForCond91Preheader = BasicBlock::Create( + F.getContext(), "for.cond91.preheader", &F, ForCond11PreheaderPreheader); + BasicBlock *ForCond95PreheaderLrPh = + BasicBlock::Create(F.getContext(), "for.cond95.preheader.lr.ph", &F, + ForCond11PreheaderPreheader); + + // Set predecessors for the basic blocks + ForCondPreheader->getTerminator()->setSuccessor(0, ForCond8PreheaderLrPh); + ForCondPreheader->getTerminator()->setSuccessor(1, ForCond91Preheader); + + // Find the parameter named patlen from the function arguments + Value *PatlenArg = F.getArg(3); + Value *SignalArg = F.getArg(0); + assert(PatlenArg && "Parameter named patlen not found\n"); + assert(SignalArg && "Parameter named signal not found\n"); + + // Add instructions to the for.cond8.preheader.lr.ph basic block + Builder.SetInsertPoint(ForCond8PreheaderLrPh); + Value *Cmp9242 = Builder.CreateICmpSGT( + PatlenArg, ConstantInt::get(PatlenArg->getType(), 0), "cmp9242"); + Builder.CreateBr(ForCond8Preheader); + + // Add instructions to the for.cond8.preheader basic block + Builder.SetInsertPoint(ForCond8Preheader); + PHINode *N0276 = + Builder.CreatePHI(Type::getInt32Ty(F.getContext()), 2, "n.0276"); + N0276->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + ForCond8PreheaderLrPh); + + // Create conditional branch instruction + Builder.CreateCondBr(Cmp9242, ForBody10LrPh, nullptr); + + // Add instructions to the for.body10.lr.ph basic block + Builder.SetInsertPoint(ForBody10LrPh); + + // Create getelementptr instruction + Value *GEP = + Builder.CreateGEP(Type::getFloatTy(F.getContext()), SignalArg, N0276, ""); + + // Create unconditional branch instruction to ClonedForBody + Builder.CreateBr(ClonedForBody); + + // Add instructions to the for.cond91.preheader basic block + Builder.SetInsertPoint(ForCond91Preheader); + + // Create PHI node + PHINode *N0Lcssa = + Builder.CreatePHI(Type::getInt32Ty(F.getContext()), 2, "n.0.lcssa"); + N0Lcssa->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + ForCondPreheader); + // Note: [ %add89, %for.cond.cleanup ] part not added yet + + // Create comparison instruction + Value *Cmp92Not282 = + Builder.CreateICmpSGT(N0Lcssa, FirstInst, "cmp92.not282"); + + // Create conditional branch instruction + Builder.CreateCondBr(Cmp92Not282, returnBB, ForCond95PreheaderLrPh); + + // Add instructions to the for.cond95.preheader.lr.ph basic block + Builder.SetInsertPoint(ForCond95PreheaderLrPh); + + Value *Cmp92678 = Builder.CreateICmpSGT( + PatlenArg, ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + "Cmp92678"); + // Insert Cmp92678 + Builder.CreateCondBr(Cmp92678, ForCond11PreheaderUs, + ForCond11PreheaderPreheader); + + Builder.SetInsertPoint(ForCond11PreheaderPreheader, + ForCond11PreheaderPreheader->begin()); + + Instruction 
*ForCond11PreheaderPreheaderterminater =
+      ForCond11PreheaderPreheader->getTerminator();
+  Instruction *ForCond11PreheaderPreheaderFirstInst =
+      &*ForCond11PreheaderPreheader->begin();
+  Value *SiglenArg = ForCond11PreheaderPreheaderFirstInst->getOperand(0);
+  // Calculate the result of n.0.lcssa left shifted by 2 bits
+  Value *ShiftedN = Builder.CreateShl(
+      N0Lcssa, ConstantInt::get(Type::getInt32Ty(F.getContext()), 2), "");
+
+  // Create getelementptr instruction
+  // Find memset function call
+  CallInst *MemsetCall = getFirstCallInstWithName(ForCond11PreheaderPreheader,
+                                                  "llvm.memset.p0.i32");
+
+  // Ensure memset call is found
+  assert(MemsetCall && "memset call not found");
+
+  // Get DestArg
+  Value *DestArg = MemsetCall->getArgOperand(0);
+
+  // Create new GEP instruction
+  Value *Scevgep = Builder.CreateGEP(Type::getInt8Ty(F.getContext()), DestArg,
+                                     ShiftedN, "scevgep");
+  MemsetCall->setOperand(0, Scevgep);
+  // Calculate siglen + 1
+  Value *SiglenPlus1 = Builder.CreateAdd(
+      SiglenArg, ConstantInt::get(Type::getInt32Ty(F.getContext()), 1), "");
+
+  // Calculate n.0.lcssa + patlen
+  Value *NplusPatlen = Builder.CreateAdd(N0Lcssa, PatlenArg, "");
+
+  // Calculate (siglen + 1) - (n.0.lcssa + patlen)
+  Value *SubResult = Builder.CreateSub(SiglenPlus1, NplusPatlen, "");
+
+  // Calculate the final memset length
+  Value *MemsetLen = Builder.CreateShl(
+      SubResult, ConstantInt::get(Type::getInt32Ty(F.getContext()), 2), "");
+  Instruction *addinst = dyn_cast<Instruction>(MemsetCall->getOperand(2));
+  MemsetCall->setOperand(2, MemsetLen);
+  if (addinst && addinst->use_empty())
+    addinst->eraseFromParent();
+  if (ForCond11PreheaderPreheaderFirstInst->use_empty())
+    ForCond11PreheaderPreheaderFirstInst->eraseFromParent();
+
+  // Create a Preheader for ForCond11PreheaderUs
+  BasicBlock *ForCond11PreheaderUsPreheader =
+      BasicBlock::Create(F.getContext(), "for.cond11.preheader.us.preheader",
+                         &F, ForCond11PreheaderUs);
+
+  // Add an unconditional branch to ForCond11PreheaderUs in the new Preheader
+  BranchInst::Create(ForCond11PreheaderUs, ForCond11PreheaderUsPreheader);
+
+  // Insert new instructions in ForCond11PreheaderUsPreheader
+  Builder.SetInsertPoint(ForCond11PreheaderUsPreheader->getTerminator());
+
+  // Add %6 = add i32 %siglen, 1
+  Value *SiglenPlus2 = Builder.CreateAdd(
+      SiglenArg, ConstantInt::get(Type::getInt32Ty(F.getContext()), 1), "");
+
+  // Add %7 = sub i32 %6, %patlen
+  Value *SubResult2 = Builder.CreateSub(SiglenPlus2, PatlenArg, "");
+
+  // Find PHI node
+  PHINode *PhiNode = nullptr;
+  for (PHINode &Phi : ForCond11PreheaderUs->phis()) {
+    PhiNode = &Phi;
+    break;
+  }
+
+  assert(PhiNode && "PHI node not found in for.cond11.preheader.us\n");
+
+  // Modify incoming values of the PHI node
+  PhiNode->setIncomingBlock(1, ForCond11PreheaderUsPreheader);
+  PhiNode->setIncomingValue(1, N0Lcssa);
+
+  BasicBlock *ForCond11ForCondCleanup13CritEdgeUs = ForBody->getNextNode();
+  // Find icmp ult instruction in ForCond11ForCondCleanup13CritEdgeUs
+  ICmpInst *IcmpUltInst = getLastICmpInstWithPredicate(
+      ForCond11ForCondCleanup13CritEdgeUs, ICmpInst::ICMP_ULT);
+
+  assert(IcmpUltInst && "icmp ult instruction not found in "
+                        "ForCond11ForCondCleanup13CritEdgeUs\n");
+
+  IcmpUltInst->setOperand(0, PhiNode->getIncomingValue(0));
+  IcmpUltInst->setOperand(1, SubResult2);
+  IcmpUltInst->setPredicate(ICmpInst::ICMP_EQ);
+
+  swapTerminatorSuccessors(ForCond11ForCondCleanup13CritEdgeUs);
+
+  // Find PHI nodes in ClonedForBody
+  for (PHINode &Phi : ClonedForBody->phis()) {
+    Phi.setIncomingBlock(0,
ForBody10LrPh); + } + + // Find phi float instruction in ClonedForBody + PHINode *FloatPhi = getFirstFloatPhi(ClonedForBody); + assert(FloatPhi && "phi float node not found"); + // Find getelementptr inbounds instructions in ClonedForBody + GetElementPtrInst *GEPInst = nullptr; + GetElementPtrInst *GEPInst2 = nullptr; + for (auto &I : *ClonedForBody) { + if (auto *GEP = dyn_cast(&I)) { + if (GEP->isInBounds()) { + GEPInst = GEP; + } else { + GEPInst2 = GEP; + } + } + } + assert(GEPInst && + "getelementptr inbounds instruction not found in ClonedForBody\n"); + assert(GEPInst2 && + "getelementptr inbounds instruction not found in ClonedForBody\n"); + + GEPInst2->setOperand(0, GEP); + + Instruction *loadinst = GEPInst->getNextNode(); + GEPInst->moveBefore(FloatPhi); + loadinst->moveBefore(FloatPhi); + + if (FloatPhi) { + // Find the llvm.fmuladd.f32 instruction + Instruction *FMulAdd = + getFirstCallInstWithName(ClonedForBody, "llvm.fmuladd.f32"); + assert(FMulAdd && "llvm.fmuladd.f32 instruction not found\n"); + Instruction *InsertPoint = FMulAdd->getNextNode(); + if (FMulAdd) { + // Copy instructions unroll_factor-1 times + for (int i = 0; i < (unroll_factor - 1); ++i) { + ValueToValueMapTy VMap; + for (auto It = FloatPhi->getIterator(); &*It != FMulAdd->getNextNode(); + ++It) { + Instruction *NewInst = It->clone(); + VMap[&*It] = NewInst; + NewInst->insertBefore(InsertPoint); + } + + // Update operands of new instructions + for (auto It = FloatPhi->getIterator(); &*It != FMulAdd->getNextNode(); + ++It) { + Instruction *NewInst = cast(VMap[&*It]); + for (unsigned j = 0; j < NewInst->getNumOperands(); j++) { + Value *Op = NewInst->getOperand(j); + if (VMap.count(Op)) { + NewInst->setOperand(j, VMap[Op]); + } + } + // If NewInst is a getelementptr instruction, set its operand 1 to i+1 + if (GetElementPtrInst *GEP = dyn_cast(NewInst)) { + GEP->setOperand(0, GEPInst); + GEP->setOperand( + 1, ConstantInt::get(GEP->getOperand(1)->getType(), i + 1)); + GEP->setName("arrayidx" + std::to_string(i + 1)); + } + } + } + + } else { + assert(false && "llvm.fmuladd.f32 instruction not found\n"); + } + } else { + assert(false && "phi float instruction not found\n"); + } + movePHINodesToTop(*ClonedForBody); + groupAndReorderInstructions(ClonedForBody); + + // Create new basic block for.cond.cleanup + BasicBlock *ForCondCleanup = + BasicBlock::Create(F.getContext(), "for.cond.cleanup", &F, ClonedForBody); + + ForCond8Preheader->getTerminator()->setSuccessor(1, ForCondCleanup); + // Create unconditional branch to ClonedForBody in for.cond.cleanup + BranchInst::Create(ClonedForBody, ForCondCleanup); + + // Get the terminator instruction of ClonedForBody + Instruction *Terminator = ClonedForBody->getTerminator(); + + // Set the first successor of ClonedForBody to for.cond.cleanup + if (Terminator->getNumSuccessors() > 0) { + Terminator->setSuccessor(0, ForCondCleanup); + } + + // Clone phi float nodes from ClonedForBody to ForCondCleanup + int i = 0; + for (PHINode &Phi : ClonedForBody->phis()) { + if (Phi.getType()->isFloatTy()) { + Instruction *newPhi = Phi.clone(); + cast(newPhi)->setIncomingBlock(0, ForCond8Preheader); + newPhi->insertBefore(ForCondCleanup->getTerminator()); + if (i == 0) { + GetElementPtrInst *arrayidx = GetElementPtrInst::Create( + Type::getFloatTy(F.getContext()), DestArg, N0276, "arrayidx", + ForCondCleanup->getTerminator()); + StoreInst *storeInst = + new StoreInst(newPhi, arrayidx, ForCondCleanup->getTerminator()); + } else { + Instruction *orInst = BinaryOperator::CreateDisjoint( + 
Instruction::Or, N0276, ConstantInt::get(N0276->getType(), i), + "add"); + orInst->insertBefore(ForCondCleanup->getTerminator()); + GetElementPtrInst *arrayidx = GetElementPtrInst::Create( + Type::getFloatTy(F.getContext()), DestArg, orInst, "arrayidx", + ForCondCleanup->getTerminator()); + + StoreInst *storeInst = + new StoreInst(newPhi, arrayidx, ForCondCleanup->getTerminator()); + } + i++; + } + } + + // Insert new instructions at the end of ClonedForBody + Builder.SetInsertPoint(ForCondCleanup->getTerminator()); + Value *add89 = Builder.CreateAdd( + N0276, ConstantInt::get(N0276->getType(), unroll_factor), "add89", true, + true); + Value *cmp7 = Builder.CreateICmpSLT(add89, Sub6, "cmp7"); + + // Get the original terminator instruction + Instruction *OldTerminator = ForCondCleanup->getTerminator(); + + // Create new conditional branch instruction + BranchInst *NewBr = + BranchInst::Create(ForCond8Preheader, ForCond91Preheader, cmp7); + + // Insert new branch instruction and delete the old terminator + ReplaceInstWithInst(OldTerminator, NewBr); + + movePHINodesToTop(*ForCondCleanup); + groupAndReorderInstructions(ForCondCleanup); + + // Update PHI nodes in for.cond8.preheader + for (PHINode &Phi : ForCond8Preheader->phis()) { + Phi.addIncoming(add89, ForCondCleanup); + } + + // Update PHI nodes in for.cond91.preheader + for (PHINode &Phi : ForCond91Preheader->phis()) { + Phi.addIncoming(add89, ForCondCleanup); + } + + // Iterate through all PHI nodes in returnBB + for (PHINode &Phi : returnBB->phis()) { + // Add new incoming value for each PHI node + Phi.addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), + ForCond91Preheader); + } + // for.cond95.preheader.lr.ph -> for.cond11.preheader.us.preheader + ForCond95PreheaderLrPh->getTerminator()->setSuccessor( + 0, ForCond11PreheaderUsPreheader); +} + +static bool checkIfDotProdSimplest(Function &F) { + bool flag = false; + + if (F.size() == 3) { + BasicBlock *entryBB = getBasicBlockByName(F, "entry"); + BasicBlock *forCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + BasicBlock *forBody = getBasicBlockByName(F, "for.body"); + if (entryBB && forCondCleanup && forBody) { + CallInst *fmuladd = getFirstCallInstWithName(forBody, "llvm.fmuladd.f32"); + if (fmuladd) { + if (forBody->getTerminator()->getSuccessor(0) == forCondCleanup && + forBody->getTerminator()->getSuccessor(1) == forBody) { + if (entryBB->getTerminator()->getSuccessor(0) == forBody) { + flag = true; + } + } + } + } + } + return flag; +} +// for dotprod, llvm.fmuladd.f32 is in for.body +static bool checkIfDotProdComplicated(Function &F) { + bool flag1 = false; + bool flag2 = false; + bool flag3 = false; + if (F.size() == 3) { + BasicBlock *entryBB = getBasicBlockByName(F, "entry"); + BasicBlock *forCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + BasicBlock *forBody = getBasicBlockByName(F, "for.body"); + if (entryBB && forCondCleanup && forBody) { + CallInst *fmuladd = getFirstCallInstWithName(forBody, "llvm.fmuladd.f32"); + if (fmuladd) { + + if (forBody->getTerminator()->getSuccessor(0) == forCondCleanup && + forBody->getTerminator()->getSuccessor(1) == forBody) { + if (entryBB->getTerminator()->getSuccessor(0) == forBody) { + flag1 = true; + } + } + } + } + if (forBody) { + for (Instruction &I : *forBody) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::FAdd || + BinOp->getOpcode() == Instruction::FMul || + BinOp->getOpcode() == Instruction::FSub || + BinOp->getOpcode() == Instruction::FDiv) { + flag2 = true; + 
} + } + } + + // Check if forBody has exactly one float PHI node + int floatPhiCount = 0; + for (PHINode &Phi : forBody->phis()) { + if (Phi.getType()->isFloatTy()) { + floatPhiCount++; + } + } + if (floatPhiCount == 1) { + flag3 = true; + } + } + } + + return flag1 && flag2 && flag3; +} +static bool shouldUnrollLoopWithCount(Function &F, Loop *L, + ScalarEvolution &SE) { + if (!checkIfDotProdSimplest(F)) { + return false; + } + // Check if the loop is suitable for unrolling + if (!L->getLoopLatch()) + return false; + if (!L->getExitingBlock()) + return false; + + // Check if the loop count is fixed and appropriate, loop count is constant + const SCEV *TripCount = SE.getBackedgeTakenCount(L); + if (isa(TripCount)) { + // More condition checks can be added here + return true; + } + return false; +} + +static void +insertPhiNodesForFMulAdd(BasicBlock *LoopHeader, BasicBlock *LoopPreheader, + SmallVector &FMulAddCalls) { + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + for (Instruction &I : *LoopHeader) { + if (CallInst *CI = dyn_cast(&I)) { + if (Function *F = CI->getCalledFunction()) { + if (F->getName() == "llvm.fmuladd.f32" && CI->isTailCall()) { + FMulAddCalls.push_back(CI); + } + } + } + } + + // Insert phi nodes for each FMulAdd call + for (CallInst *CI : FMulAddCalls) { + // Create new phi node + PHINode *PHI = + PHINode::Create(CI->getType(), 2, CI->getName() + ".phi", CI); + + // Set incoming values for phi node + PHI->addIncoming(ConstantFP::get(CI->getType(), 0), LoopPreheader); + PHI->addIncoming(CI, LoopHeader); + + CI->setOperand(2, PHI); + } +} + +static void postUnrollLoopWithCount(Function &F, Loop *L, int unroll_count) { + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *LoopPreheader = L->getLoopPreheader(); + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + SmallVector FMulAddCalls; + insertPhiNodesForFMulAdd(LoopHeader, LoopPreheader, FMulAddCalls); + + movePHINodesToTop(*LoopHeader); + modifyAddToOr(LoopHeader); + groupAndReorderInstructions(LoopHeader); + + // Create for.end basic block after LoopHeader + ICmpInst *LastICmp = getLastICmpInst(LoopHeader); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + // Get the first operand of LastICmp + Value *Operand1 = LastICmp->getOperand(1); + + // Directly set the first operand of LastICmp to a new constant value + LastICmp->setOperand( + 1, ConstantInt::get(Operand1->getType(), + dyn_cast(Operand1)->getSExtValue() - + (2 * unroll_count - 1))); + LastICmp->setName("cmp"); + + swapTerminatorSuccessors(LoopHeader); + + // After swapping, succ 0 is LoopHeader, succ 1 is returnBB + BasicBlock *ExitingBlock = L->getExitBlock(); + ExitingBlock->setName("for.end"); + + // Get ret instruction in ExitingBlock + ReturnInst *RetInst = dyn_cast(ExitingBlock->getTerminator()); + if (!RetInst) { + assert(false && "ret instruction not found\n"); + return; + } + + // Get the original return value + Value *OriginalRetValue = RetInst->getOperand(0); + + // Create IRBuilder, set insertion point before ret instruction + IRBuilder<> Builder(RetInst); + + // Create a series of fadd instructions + Value *CurrentSum = OriginalRetValue; + Value *add37 = Builder.CreateFAdd(FMulAddCalls[1], CurrentSum, "add37"); + Value *add38 = Builder.CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], "add38"); + Value *add39 = Builder.CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], "add39"); + Value *add40 = Builder.CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], "add40"); + Value *add41 = Builder.CreateFAdd(add37, add38, "add41"); + Value 
*add42 = Builder.CreateFAdd(add39, add40, "add42"); + CurrentSum = Builder.CreateFAdd(add41, add42, "add43"); + + // Replace the original ret instruction + RetInst->setOperand(0, CurrentSum); + + // Verify function + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(errs() << "Function verification failed\n"); + return; + } +} + +static bool shouldUnrollComplexLoop(Function &F, Loop *L, ScalarEvolution &SE, + DominatorTree &DT, LoopInfo &LI) { + if (!checkIfDotProdComplicated(F)) { + return false; + } + // Check if the loop is suitable for unrolling + if (!L->getLoopLatch()) + return false; + if (!L->getExitingBlock()) + return false; + + if (L->getCanonicalInductionVariable()) + return false; + // Check if the loop count is fixed and appropriate, loop count is constant + BasicBlock *LoopPreheader = L->getLoopPreheader(); + // Get the start value of the loop + if (LoopPreheader) { + return false; + } + + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *NewPreheader = + BasicBlock::Create(LoopHeader->getContext(), "for.cond.preheader", + LoopHeader->getParent(), LoopHeader); + // Redirect all external predecessors to the new preheader basic block + for (BasicBlock *pred : predecessors(LoopHeader)) { + if (!L->contains(pred)) { + pred->getTerminator()->replaceUsesOfWith(LoopHeader, NewPreheader); + // Update PHI nodes in the loop header to point to the new preheader basic + // block + for (PHINode &PN : LoopHeader->phis()) { + int Index = PN.getBasicBlockIndex(pred); + if (Index != -1) { + PN.setIncomingBlock(Index, NewPreheader); + } + } + } + } + // Jump from the new preheader to the loop header + BranchInst::Create(LoopHeader, NewPreheader); + return true; +} + +static bool shouldUnrollAddcType(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 6) + return false; + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 1) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *IfEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForBodyClone = getBasicBlockByName(F, "for.body.clone"); + BasicBlock *Return = getBasicBlockByName(F, "return"); + + if (!Entry || !IfEnd || !ForCondPreheader || !ForBody || !ForBodyClone || + !Return) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != Return || + Entry->getTerminator()->getSuccessor(1) != IfEnd || + IfEnd->getTerminator()->getSuccessor(0) != ForBody || + IfEnd->getTerminator()->getSuccessor(1) != ForCondPreheader || + ForCondPreheader->getTerminator()->getSuccessor(0) != ForBodyClone || + ForCondPreheader->getTerminator()->getSuccessor(1) != Return || + ForBody->getTerminator()->getSuccessor(0) != Return || + ForBody->getTerminator()->getSuccessor(1) != ForBody || + ForBodyClone->getTerminator()->getSuccessor(0) != Return || + ForBodyClone->getTerminator()->getSuccessor(1) != ForBodyClone) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + if (L->getSubLoops().size() == 1) { + innerLoopCount++; + } + } + } + + if (outerLoopCount != 2 || innerLoopCount != 0) { + return false; + } + + return true; +} + +static bool 
shouldUnrollDotprodType(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 5) + return false; + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 1) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *IfEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForBodyClone = getBasicBlockByName(F, "for.body.clone"); + + if (!Entry || !IfEnd || !ForCondPreheader || !ForBody || !ForBodyClone) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != ForBody || + Entry->getTerminator()->getSuccessor(1) != ForCondPreheader || + ForCondPreheader->getTerminator()->getSuccessor(0) != ForBodyClone || + ForCondPreheader->getTerminator()->getSuccessor(1) != IfEnd || + ForBody->getTerminator()->getSuccessor(0) != IfEnd || + ForBody->getTerminator()->getSuccessor(1) != ForBody || + ForBodyClone->getTerminator()->getSuccessor(0) != IfEnd || + ForBodyClone->getTerminator()->getSuccessor(1) != ForBodyClone) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + if (L->getSubLoops().size() == 1) { + innerLoopCount++; + } + } + } + + if (outerLoopCount != 2 || innerLoopCount != 0) { + return false; + } + + return true; +} + +static std::pair modifyEntryBB(BasicBlock &entryBB) { + ICmpInst *icmp = getLastICmpInst(&entryBB); + assert(icmp && "icmp not found"); + Value *start_index = icmp->getOperand(0); + Value *end_index = icmp->getOperand(1); + // Insert new instructions before icmp + IRBuilder<> Builder(icmp); + Value *sub = Builder.CreateNSWAdd( + end_index, ConstantInt::get(end_index->getType(), -8), "sub"); + icmp->setOperand(0, sub); + icmp->setOperand(1, start_index); + return std::make_pair(sub, end_index); +} + +static void postUnrollLoopWithVariable(Function &F, Loop *L, int unroll_count) { + BasicBlock *LoopPreheader = L->getLoopPreheader(); + // Get the basic blocks to merge + SmallVector BBsToMerge; + BasicBlock *ForBody1 = getBasicBlockByName(F, "for.body.1"); + BasicBlock *ForBody2 = getBasicBlockByName(F, "for.body.2"); + BasicBlock *ForBody3 = getBasicBlockByName(F, "for.body.3"); + BasicBlock *ForBody4 = getBasicBlockByName(F, "for.body.4"); + BasicBlock *ForBody5 = getBasicBlockByName(F, "for.body.5"); + BasicBlock *ForBody6 = getBasicBlockByName(F, "for.body.6"); + BasicBlock *ForBody7 = getBasicBlockByName(F, "for.body.7"); + assert(ForBody1 && ForBody2 && ForBody3 && ForBody4 && ForBody5 && ForBody6 && + ForBody7 && "basic block not found"); + BBsToMerge.push_back(ForBody1); + BBsToMerge.push_back(ForBody2); + BBsToMerge.push_back(ForBody3); + BBsToMerge.push_back(ForBody4); + BBsToMerge.push_back(ForBody5); + BBsToMerge.push_back(ForBody6); + BBsToMerge.push_back(ForBody7); + + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *LoopHeaderClone = + cloneBasicBlockWithRelations(LoopHeader, ".clone", &F); + LoopHeaderClone->moveAfter(LoopHeader); + // Create a new basic block as for.end + BasicBlock *ForEnd = getBasicBlockByName(F, "for.cond.cleanup"); + assert(ForEnd && "basic block not found"); + ForEnd->setName("for.end"); + + 
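+  // From here on, LoopHeaderClone becomes the scalar remainder loop: point
+  // its back edge at itself and fix up its PHIs, then fold the unrolled
+  // copies for.body.1 .. for.body.7 into their single predecessor so the
+  // main unrolled body ends up as one straight-line block.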
LoopHeaderClone->getTerminator()->setSuccessor(1, LoopHeaderClone); + for (PHINode &Phi : LoopHeaderClone->phis()) { + Phi.setIncomingBlock(1, LoopHeaderClone); + } + + for (BasicBlock *BB : BBsToMerge) { + MergeBasicBlockIntoOnlyPred(BB); + } + + // Adjust positions + LoopHeaderClone->moveAfter(getBasicBlockByName(F, "for.body.7")); + assert(LoopHeaderClone && "basic block not found"); + ForEnd->moveAfter(LoopHeaderClone); + + BasicBlock &entryBB = F.getEntryBlock(); + auto [Sub, end_index] = modifyEntryBB(entryBB); + entryBB.getTerminator()->setSuccessor(1, ForBody7); + + SmallVector FAMSDInsts; + for (Instruction &I : *ForBody7) { + if (auto *BinOp = dyn_cast(&I)) { + if (BinOp->getOpcode() == Instruction::FAdd || + BinOp->getOpcode() == Instruction::FMul || + BinOp->getOpcode() == Instruction::FSub || + BinOp->getOpcode() == Instruction::FDiv) { + FAMSDInsts.push_back(BinOp); + } + } + } + assert(!FAMSDInsts.empty() && "fadd/fmul/fsub/fdiv instruction not found"); + PHINode *firstFloatPhi = getFirstFloatPhi(ForBody7); + assert(firstFloatPhi && "phi node not found"); + // Clone phi node 7 times + for (int i = 0; i < 7; i++) { + PHINode *clonedPhi = cast(firstFloatPhi->clone()); + clonedPhi->setName("result" + Twine(i)); + clonedPhi->insertAfter(firstFloatPhi); + auto *temp = FAMSDInsts[i]; + clonedPhi->setIncomingValue(1, temp); + temp->setOperand(0, clonedPhi); + } + + for (PHINode &Phi : ForBody7->phis()) { + Phi.setIncomingBlock(0, &entryBB); + auto *temp = Phi.clone(); + temp->setName("result0.0.lcssa"); + temp->insertBefore(LoopPreheader->getTerminator()); + } + + ICmpInst *lastICmp = getLastICmpInst(ForBody7); + assert(lastICmp && "icmp not found"); + lastICmp->setOperand(1, Sub); + lastICmp->setPredicate(ICmpInst::ICMP_SLT); + + ForBody7->getTerminator()->setSuccessor(0, LoopPreheader); + ForBody7->getTerminator()->setSuccessor(1, ForBody7); + + PHINode *firstI32Phi = getFirstI32Phi(LoopPreheader); + assert(firstI32Phi && "phi node not found"); + // Insert icmp slt instruction in LoopPreheader + IRBuilder<> Builder(LoopPreheader->getTerminator()); + ICmpInst *NewICmp = + cast(Builder.CreateICmpSLT(firstI32Phi, end_index, "cmp")); + + // Convert the original unconditional branch to a conditional branch + BranchInst *OldBr = cast(LoopPreheader->getTerminator()); + BranchInst *NewBr = BranchInst::Create(LoopHeaderClone, ForEnd, NewICmp); + ReplaceInstWithInst(OldBr, NewBr); + + Instruction *faddInst = nullptr; + Instruction *addNswInst = nullptr; + + for (auto &I : *LoopHeaderClone) { + if (auto *BinOp = dyn_cast(&I)) { + if ((BinOp->getOpcode() == Instruction::FAdd || + BinOp->getOpcode() == Instruction::FMul || + BinOp->getOpcode() == Instruction::FSub || + BinOp->getOpcode() == Instruction::FDiv) && + BinOp->getType()->isFloatTy()) { + faddInst = BinOp; + } else if (BinOp->getOpcode() == Instruction::Add && + BinOp->hasNoSignedWrap()) { + addNswInst = BinOp; + } + } + + if (faddInst && addNswInst) { + break; + } + } + assert(faddInst && addNswInst && + "fadd/fmul/fsub/fdiv float and add nsw instructions not found"); + PHINode *firstI32PhiLoopHeaderClone = getFirstI32Phi(LoopHeaderClone); + assert(firstI32PhiLoopHeaderClone && "phi node not found"); + firstI32PhiLoopHeaderClone->setIncomingValue(0, firstI32Phi); + firstI32PhiLoopHeaderClone->setIncomingValue(1, addNswInst); + + PHINode *firstFloatPhiLoopHeaderClone = getFirstFloatPhi(LoopHeaderClone); + assert(firstFloatPhiLoopHeaderClone && "phi node not found"); + PHINode *lastFloatPhiLoopPreheader = 
getLastFloatPhi(LoopPreheader); + assert(lastFloatPhiLoopPreheader && "phi node not found"); + firstFloatPhiLoopHeaderClone->setIncomingValue(0, lastFloatPhiLoopPreheader); + firstFloatPhiLoopHeaderClone->setIncomingValue(1, faddInst); + + // Collect all phi float instructions in LoopPreheader + SmallVector floatPhis; + for (auto &I : *LoopPreheader) { + if (auto *Phi = dyn_cast(&I)) { + if (Phi->getType()->isFloatTy()) { + floatPhis.push_back(Phi); + } + } + } + + // Get the ret instruction in ExitingBlock + ReturnInst *RetInst = dyn_cast(ForEnd->getTerminator()); + if (!RetInst) { + assert(false && "ret instruction not found in ExitingBlock"); + return; + } + + // Get the original return value + Value *OriginalRetValue = RetInst->getOperand(0); + + // Create IRBuilder, set insertion point before the ret instruction + + Builder.SetInsertPoint(RetInst); + // Create a series of fadd instructions + assert(floatPhis.size() == 8 && "expected floatPhis has 8 phi node"); + Value *CurrentSum = nullptr; + Value *add64 = Builder.CreateFAdd(floatPhis[0], OriginalRetValue, "add64"); + Value *add65 = Builder.CreateFAdd(floatPhis[1], floatPhis[2], "add65"); + Value *add66 = Builder.CreateFAdd(floatPhis[3], floatPhis[4], "add66"); + Value *add67 = Builder.CreateFAdd(floatPhis[5], floatPhis[6], "add67"); + Value *add68 = Builder.CreateFAdd(add64, add65, "add68"); + Value *add69 = Builder.CreateFAdd(add66, add67, "add69"); + CurrentSum = Builder.CreateFAdd(add68, add69, "add70"); + + // Replace the original ret instruction + RetInst->setOperand(0, CurrentSum); + PHINode *firstFloatPhiForEnd = getFirstFloatPhi(ForEnd); + assert(firstFloatPhiForEnd && "phi node not found"); + // Remove existing incoming values from firstFloatPhiForEnd + while (firstFloatPhiForEnd->getNumIncomingValues() > 0) { + firstFloatPhiForEnd->removeIncomingValue(0u, false); + } + // Add two incoming values to firstFloatPhiForEnd + firstFloatPhiForEnd->addIncoming(faddInst, LoopHeaderClone); + firstFloatPhiForEnd->addIncoming(lastFloatPhiLoopPreheader, LoopPreheader); + + runDeadCodeElimination(F); +} + +static bool shouldUnrollCorr(Function &F, LoopInfo *LI) { + if (F.size() != 7) + return false; + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *Return = getBasicBlockByName(F, "return"); + + if (!Entry || !ForCondPreheader || !Return) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != Return || + Entry->getTerminator()->getSuccessor(1) != ForCondPreheader) { + return false; + } + + // Feature 2: Has 5 parameters + if (F.arg_size() != 5) { + return false; + } + + unsigned int loopNestLevel = 0; + for (auto &BB : F) { + if (isa(BB.getTerminator())) { + loopNestLevel = std::max(loopNestLevel, LI->getLoopDepth(&BB)); + } + } + if (loopNestLevel != 2) { + return false; + } + + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + return true; +} + +static bool shouldUnrollConvccorr(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 17) + return false; + + // Check the number of parameters + if (F.arg_size() != 5) { + return false; + } + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth 
!= 2) { + return false; + } + + // Check if the fmuladd.f32 inline function is used + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForEnd = getBasicBlockByName(F, "for.end"); + BasicBlock *Return = getBasicBlockByName(F, "return"); + + if (!Entry || !ForBody || !ForEnd || !Return) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != Return || + ForEnd->getTerminator()->getSuccessor(1) != ForBody) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + if (L->getSubLoops().size() == 1) { + innerLoopCount++; + } + } + } + + if (outerLoopCount != 3 || innerLoopCount != 3) { + return false; + } + + // Check if there are three icmp eq instructions in the entry basic block + int icmpEqCount = 0; + for (auto &I : *Entry) { + if (auto *ICmp = dyn_cast(&I)) { + if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { + icmpEqCount++; + } + } + } + + if (icmpEqCount != 3) { + return false; + } + + return true; +} + +static bool shouldUnrollFird(Function &F, LoopInfo *LI) { + + // Check the number of basic blocks + if (F.size() != 14) + return false; + + // Check the number of parameters + if (F.arg_size() != 4) { + return false; + } + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 2) { + return false; + } + + // Check if the fmuladd.f32 inline function is used + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + + if (!Entry || !ForCondCleanup) + return false; + + if (Entry->getTerminator()->getSuccessor(1) != ForCondCleanup) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + } else if (L->getLoopDepth() == 2) { + innerLoopCount++; + } else { + return false; + } + } + + if (outerLoopCount != 1 || innerLoopCount != 3) { + return false; + } + + return true; +} + +static bool shouldUnrollFirType(Function &F, LoopInfo *LI) { + // Check the number of basic blocks + if (F.size() != 19) + return false; + + // Check the number of parameters + if (F.arg_size() != 4) { + return false; + } + + // Check the loop nesting level + unsigned int maxLoopDepth = 0; + for (auto &BB : F) { + maxLoopDepth = std::max(maxLoopDepth, LI->getLoopDepth(&BB)); + } + if (maxLoopDepth != 2) { + return false; + } + + // Check if the fmuladd.f32 inline function is used + bool hasFMulAdd = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (RecurrenceDescriptor::isFMulAddIntrinsic(&I)) { + hasFMulAdd = true; + break; + } + } + if (hasFMulAdd) + break; + } + if (!hasFMulAdd) { + return false; + } + + 
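+  // The remaining checks pin down the exact CFG shape of the fir kernel:
+  // the named blocks below must exist and be wired together as expected,
+  // and the loop nest must consist of two outer loops with two (clang) or
+  // four (opt) inner loops.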
BasicBlock *Entry = getBasicBlockByName(F, "entry"); + BasicBlock *ForCondPreheader = getBasicBlockByName(F, "for.cond.preheader"); + BasicBlock *ForBodyLrPh = getBasicBlockByName(F, "for.body.lr.ph"); + BasicBlock *IfEnd = getBasicBlockByName(F, "if.end"); + BasicBlock *ForBody = getBasicBlockByName(F, "for.body"); + BasicBlock *ForBodyClone = getBasicBlockByName(F, "for.body.clone"); + BasicBlock *ForBodyLrPhClone = getBasicBlockByName(F, "for.body.lr.ph.clone"); + + if (!Entry || !ForCondPreheader || !ForBodyLrPh || !IfEnd || !ForBody || + !ForBodyClone || !ForBodyLrPhClone) + return false; + + if (Entry->getTerminator()->getSuccessor(0) != ForCondPreheader || + Entry->getTerminator()->getSuccessor(1) != ForBodyLrPhClone || + ForCondPreheader->getTerminator()->getSuccessor(0) != ForBodyLrPh || + ForCondPreheader->getTerminator()->getSuccessor(1) != IfEnd || + ForBodyLrPh->getSingleSuccessor() != ForBody || + ForBodyLrPhClone->getSingleSuccessor() != ForBodyClone) + return false; + + // Check if there are three outer loops, each with one inner loop + int outerLoopCount = 0; + int innerLoopCount = 0; + for (Loop *L : LI->getLoopsInPreorder()) { + if (L->getLoopDepth() == 1) { + outerLoopCount++; + } else if (L->getLoopDepth() == 2) { + innerLoopCount++; + } else { + return false; + } + } + // for opt is 4, for clang is 2. + if (outerLoopCount != 2 || (innerLoopCount != 2 && innerLoopCount != 4)) { + return false; + } + + return true; +} + +static void eraseAllStoreInstInBB(BasicBlock *BB) { + assert(BB && "BasicBlock is nullptr"); + // Erase all store instructions in BB + for (auto it = BB->begin(); it != BB->end();) { + if (isa(&*it)) { + it = it->eraseFromParent(); + } else { + ++it; + } + } +} + +static GetElementPtrInst *getUniqueGetElementPtrInst(BasicBlock *BB) { + assert(BB && "BasicBlock is nullptr"); + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = nullptr; + for (Instruction &I : *BB) { + if (auto *GEPI = dyn_cast(&I)) { + if (!GEP) { + GEP = GEPI; + } else { + // If multiple getelementptr instructions are found, set GEP to nullptr + // and exit the loop + GEP = nullptr; + break; + } + } + } + assert(GEP && "getelementptr instruction not found"); + return GEP; +} + +static void createCriticalEdgeAndMoveStoreInst(BasicBlock *CloneForBody, + BasicBlock *ForEnd37) { + CloneForBody->getTerminator()->setSuccessor(1, CloneForBody); + // Create a new BasicBlock: for.cond.for.end_crit_edge + BasicBlock *CriticalEdge = BasicBlock::Create( + CloneForBody->getContext(), "for.cond.for.end_crit_edge", + CloneForBody->getParent(), ForEnd37); + + // Update the terminator instruction of CloneForBody + CloneForBody->getTerminator()->setSuccessor(0, CriticalEdge); + + // Create an unconditional branch instruction to jump to OldForEnd + BranchInst::Create(ForEnd37, CriticalEdge); + + // Find and move the StoreInst in CloneForBody to CriticalEdge + StoreInst *StoreToMove = nullptr; + for (auto &Inst : *CloneForBody) { + if (auto *Store = dyn_cast(&Inst)) { + StoreToMove = Store; + break; + } + } + + if (StoreToMove) { + StoreToMove->removeFromParent(); + StoreToMove->insertBefore(CriticalEdge->getTerminator()); + } +} +static std::tuple +modifyOuterLoop4(Loop *L, BasicBlock *ForBodyMerged, + BasicBlock *CloneForBodyPreheader) { + BasicBlock *BB = L->getHeader(); + PHINode *phi = getLastPhi(BB); + // Add new instructions + IRBuilder<> Builder(BB); + Builder.SetInsertPoint(phi->getNextNode()); + + // and i32 %n.0551, -8 + Value *Add2 = Builder.CreateAnd(phi, 
ConstantInt::get(phi->getType(), -8)); + + // %sub = and i32 %n.0551, 2147483644 + Value *Sub = + Builder.CreateAnd(phi, ConstantInt::get(phi->getType(), 2147483640)); + + // %cmp12538.not = icmp eq i32 %sub, 0 + Value *Cmp = Builder.CreateICmpEQ(Sub, ConstantInt::get(phi->getType(), 0)); + + // br i1 %cmp12538.not, label %for.cond.cleanup, label %for.body.preheader + // Move the conditional branch instruction to the end of BB + auto *newcondBr = + Builder.CreateCondBr(Cmp, CloneForBodyPreheader, ForBodyMerged); + + // Erase the terminator instruction of BB + Instruction *oldTerminator = BB->getTerminator(); + newcondBr->moveAfter(oldTerminator); + oldTerminator->eraseFromParent(); + + // Erase all store instructions in BB + eraseAllStoreInstInBB(BB); + for (PHINode &Phi : ForBodyMerged->phis()) { + Phi.setIncomingBlock(1, CloneForBodyPreheader); + } + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = getUniqueGetElementPtrInst(BB); + return std::make_tuple(Sub, GEP, Add2); +} + +static void modifyInnerLoop4(Loop *L, BasicBlock *ForBodyMerged, Value *Sub, + BasicBlock *CloneForBody, GetElementPtrInst *GEP, + Value *Add2, BasicBlock *CloneForBodyPreheader) { + BasicBlock *OuterBB = L->getHeader(); + SmallVector FMulAddCalls; + insertPhiNodesForFMulAdd(ForBodyMerged, OuterBB, FMulAddCalls); + movePHINodesToTop(*ForBodyMerged); + + groupAndReorderInstructions(ForBodyMerged); + ICmpInst *LastICmp = getLastICmpInst(ForBodyMerged); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + LastICmp->setOperand(1, Sub); + swapTerminatorSuccessors(ForBodyMerged); + eraseAllStoreInstInBB(ForBodyMerged); + + Function *F = ForBodyMerged->getParent(); + + BasicBlock *NewForEnd = + BasicBlock::Create(F->getContext(), "for.end", F, ForBodyMerged); + NewForEnd->moveAfter(ForBodyMerged); + + // Create an instruction to add the results of four FMulAdd calls + assert(FMulAddCalls.size() == 8 && "Expected 8 FMulAdd calls"); + Value *Sum = nullptr; + Value *sum = BinaryOperator::CreateFAdd(FMulAddCalls[0], FMulAddCalls[1], + "sum", NewForEnd); + Value *sum23 = BinaryOperator::CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], + "sum23", NewForEnd); + Value *sum24 = BinaryOperator::CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], + "sum24", NewForEnd); + Value *sum25 = BinaryOperator::CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], + "sum25", NewForEnd); + Value *sum26 = BinaryOperator::CreateFAdd(sum, sum23, "sum26", NewForEnd); + Value *sum27 = BinaryOperator::CreateFAdd(sum24, sum25, "sum27", NewForEnd); + Sum = BinaryOperator::CreateFAdd(sum26, sum27, "sum28", NewForEnd); + IRBuilder<> Builder(NewForEnd); + Builder.SetInsertPoint(NewForEnd); + // Create a new StoreInst instruction + Builder.CreateStore(Sum, GEP); + // Create a comparison instruction + Value *Cmp = Builder.CreateICmpUGT(Add2, GEP->getOperand(1), "cmp37.not548"); + + // Create a conditional branch instruction + Builder.CreateCondBr(Cmp, ForBodyMerged->getTerminator()->getSuccessor(1), + CloneForBodyPreheader); + ForBodyMerged->getTerminator()->setSuccessor(1, NewForEnd); + CloneForBodyPreheader->moveAfter(NewForEnd); + CloneForBody->moveAfter(CloneForBodyPreheader); + + // Create a PHI node in CloneForBodyPreheader + PHINode *SumPHI = PHINode::Create(Sum->getType(), 2, "sum.phi", + CloneForBodyPreheader->getFirstNonPHI()); + + // Set the incoming values of the PHI node + SumPHI->addIncoming(ConstantFP::get(Sum->getType(), 0.0), OuterBB); + SumPHI->addIncoming(Sum, NewForEnd); + + // Create a PHI node in CloneForBodyPreheader + PHINode 
*AddPHI = PHINode::Create(Add2->getType(), 2, "add.phi", + CloneForBodyPreheader->getFirstNonPHI()); + + // Set the incoming values of the PHI node + AddPHI->addIncoming(ConstantInt::get(Add2->getType(), 0), OuterBB); + AddPHI->addIncoming(Add2, NewForEnd); + Value *phifloatincomingvalue0 = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + Value *phii32incomingvalue0 = getLastICmpInst(CloneForBody)->getOperand(0); + for (PHINode &Phi : CloneForBody->phis()) { + if (Phi.getType()->isIntegerTy(32)) { + Phi.setIncomingValue(0, AddPHI); + Phi.setIncomingBlock(0, CloneForBodyPreheader); + Phi.setIncomingValue(1, phii32incomingvalue0); + Phi.setIncomingBlock(1, CloneForBody); + } else if (Phi.getType()->isFloatTy()) { + Phi.setIncomingValue(0, SumPHI); + Phi.setIncomingBlock(0, CloneForBodyPreheader); + Phi.setIncomingValue(1, phifloatincomingvalue0); + Phi.setIncomingBlock(1, CloneForBody); + } + } + BasicBlock *OldForEnd = CloneForBody->getTerminator()->getSuccessor(0); + createCriticalEdgeAndMoveStoreInst(CloneForBody, OldForEnd); + + getFirstI32Phi(ForBodyMerged)->setIncomingBlock(1, ForBodyMerged); +} + +static std::tuple +modifyOuterLoop8(Loop *L) { + BasicBlock *BB = L->getHeader(); + ICmpInst *LastICmp = getLastICmpInst(BB); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + swapTerminatorSuccessors(BB); + + eraseAllStoreInstInBB(BB); + Value *lsig_0 = getFirstI32Phi(BB)->getIncomingValue(0); + Value *add207 = LastICmp->getOperand(0); + Value *sub206 = cast(add207)->getOperand(0); + // Add new instructions before LastICmp + IRBuilder<> Builder(LastICmp); + + // %add207.neg = xor i32 %sub206, -1 + Value *Add207Neg = Builder.CreateXor( + sub206, ConstantInt::get(sub206->getType(), -1), "add207.neg"); + + // %add211 = add i32 %lsig.0, %add207.neg + Value *Add211 = Builder.CreateAdd(lsig_0, Add207Neg, "add211"); + + // %div212535 = and i32 %add211, -8 + Value *Div212535 = Builder.CreateAnd( + Add211, ConstantInt::get(Add211->getType(), -8), "div212535"); + + // %add214 = add i32 %div212535, %add207 + Value *Add214 = Builder.CreateAdd(Div212535, add207, "add214"); + + // Set the second operand of LastICmp to Add214 + LastICmp->setOperand(1, Add214); + + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = getUniqueGetElementPtrInst(BB); + + return std::make_tuple(Add214, add207, GEP); +} + +static std::tuple +modifyOuterLoop16(Loop *L) { + BasicBlock *BB = L->getHeader(); + BasicBlock *BBLoopPreHeader = L->getLoopPreheader(); + ICmpInst *LastICmp = getLastICmpInst(BB); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + swapTerminatorSuccessors(BB); + + eraseAllStoreInstInBB(BB); + Value *lkern_0 = getFirstI32Phi(BB)->getIncomingValue(1); + // Insert an and instruction in BBLoopPreHeader + IRBuilder<> Builder(BBLoopPreHeader->getTerminator()); + Value *Div536 = Builder.CreateAnd(lkern_0, -16, "div536"); + // Get the first operand of LastICmp + Value *Add56 = LastICmp->getOperand(0); + + // Create an add instruction before LastICmp + Builder.SetInsertPoint(LastICmp); + Value *Add60 = Builder.CreateAdd(Div536, Add56, "add60"); + + // Set the second operand of LastICmp to Add60 + LastICmp->setOperand(1, Add60); + + // Get the unique getelementptr instruction in BB + GetElementPtrInst *GEP = getUniqueGetElementPtrInst(BB); + + return std::make_tuple(Add60, Add56, GEP); +} + +static void modifyInnerLoop(Loop *L, BasicBlock *ForBodyMerged, Value *Add60, + BasicBlock *CloneForBody, Value *Add56, + GetElementPtrInst *GEP, uint32_t unroll_count) { + assert((unroll_count 
== 8 || unroll_count == 16) && + "unroll_count must be 8 or 16"); + BasicBlock *OuterBB = L->getHeader(); + + // Find the predecessor BasicBlock of ForBodyMergedPreheader + BasicBlock *PredBB = ForBodyMerged->getSinglePredecessor(); + if (!PredBB) { + // If there is no single predecessor, traverse all predecessors + for (BasicBlock *Pred : predecessors(ForBodyMerged)) { + PredBB = Pred; + break; // Take the first predecessor + } + } + assert(PredBB && "can't find predecessor of ForBodyMerged"); + + SmallVector FMulAddCalls; + insertPhiNodesForFMulAdd(ForBodyMerged, PredBB, FMulAddCalls); + + movePHINodesToTop(*ForBodyMerged); + + groupAndReorderInstructions(ForBodyMerged); + ICmpInst *LastICmp = getLastICmpInst(ForBodyMerged); + LastICmp->setPredicate(ICmpInst::ICMP_ULT); + LastICmp->setOperand(1, Add60); + swapTerminatorSuccessors(ForBodyMerged); + eraseAllStoreInstInBB(ForBodyMerged); + + BasicBlock *ForEndLoopExit = ForBodyMerged->getTerminator()->getSuccessor(1); + // Create an instruction to add the results of four FMulAdd calls + Value *Sum = nullptr; + if (unroll_count == 16) { + Value *sum45 = + BinaryOperator::CreateFAdd(FMulAddCalls[0], FMulAddCalls[1], "sum45", + ForEndLoopExit->getTerminator()); + Value *sum46 = + BinaryOperator::CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], "sum46", + ForEndLoopExit->getTerminator()); + Value *sum47 = + BinaryOperator::CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], "sum47", + ForEndLoopExit->getTerminator()); + Value *sum48 = + BinaryOperator::CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], "sum48", + ForEndLoopExit->getTerminator()); + Value *sum49 = + BinaryOperator::CreateFAdd(FMulAddCalls[8], FMulAddCalls[9], "sum49", + ForEndLoopExit->getTerminator()); + Value *sum50 = + BinaryOperator::CreateFAdd(FMulAddCalls[10], FMulAddCalls[11], "sum50", + ForEndLoopExit->getTerminator()); + Value *sum51 = + BinaryOperator::CreateFAdd(FMulAddCalls[12], FMulAddCalls[13], "sum51", + ForEndLoopExit->getTerminator()); + Value *sum52 = + BinaryOperator::CreateFAdd(FMulAddCalls[14], FMulAddCalls[15], "sum52", + ForEndLoopExit->getTerminator()); + + Value *sum53 = BinaryOperator::CreateFAdd(sum45, sum46, "sum53", + ForEndLoopExit->getTerminator()); + Value *sum54 = BinaryOperator::CreateFAdd(sum47, sum48, "sum54", + ForEndLoopExit->getTerminator()); + Value *sum55 = BinaryOperator::CreateFAdd(sum49, sum50, "sum55", + ForEndLoopExit->getTerminator()); + Value *sum56 = BinaryOperator::CreateFAdd(sum51, sum52, "sum56", + ForEndLoopExit->getTerminator()); + + Value *sum57 = BinaryOperator::CreateFAdd(sum53, sum54, "sum57", + ForEndLoopExit->getTerminator()); + Value *sum58 = BinaryOperator::CreateFAdd(sum55, sum56, "sum58", + ForEndLoopExit->getTerminator()); + + Sum = BinaryOperator::CreateFAdd(sum57, sum58, "sum59", + ForEndLoopExit->getTerminator()); + } else if (unroll_count == 8) { + Value *sum60 = + BinaryOperator::CreateFAdd(FMulAddCalls[0], FMulAddCalls[1], "sum60", + ForEndLoopExit->getTerminator()); + Value *sum61 = + BinaryOperator::CreateFAdd(FMulAddCalls[2], FMulAddCalls[3], "sum61", + ForEndLoopExit->getTerminator()); + Value *sum62 = + BinaryOperator::CreateFAdd(FMulAddCalls[4], FMulAddCalls[5], "sum62", + ForEndLoopExit->getTerminator()); + Value *sum63 = + BinaryOperator::CreateFAdd(FMulAddCalls[6], FMulAddCalls[7], "sum63", + ForEndLoopExit->getTerminator()); + + Value *sum64 = BinaryOperator::CreateFAdd(sum60, sum61, "sum64", + ForEndLoopExit->getTerminator()); + Value *sum65 = BinaryOperator::CreateFAdd(sum62, sum63, "sum65", + 
ForEndLoopExit->getTerminator()); + Sum = BinaryOperator::CreateFAdd(sum64, sum65, "sum66", + ForEndLoopExit->getTerminator()); + } + + // Create a new basic block for.end164 + BasicBlock *ForEnd164 = BasicBlock::Create( + ForEndLoopExit->getContext(), "for.end164", ForEndLoopExit->getParent(), + ForEndLoopExit->getNextNode()); + + // Set the target of the terminator instruction of ForEndLoopExit to + // for.end164 + Instruction *Terminator = ForEndLoopExit->getTerminator(); + BasicBlock *OldSuccessor = Terminator->getSuccessor(0); + Terminator->setSuccessor(0, ForEnd164); + + // Create an unconditional branch instruction in for.end164, jumping to the + // original successor basic block + BranchInst::Create(OldSuccessor, ForEnd164); + + // Create a new phi node in for.end164 + PHINode *PhiSum = PHINode::Create(Type::getInt32Ty(ForEnd164->getContext()), + 2, "phi.sum", ForEnd164->getFirstNonPHI()); + + // Set the incoming values of the phi node + PhiSum->addIncoming(Add56, OuterBB); + PhiSum->addIncoming(LastICmp->getOperand(0), ForEndLoopExit); + + // Create a new phi float node in for.end164 + PHINode *PhiFloat = + PHINode::Create(Type::getFloatTy(ForEnd164->getContext()), 2, "phi.float", + ForEnd164->getFirstNonPHI()); + + // Set the incoming values of the phi node + PhiFloat->addIncoming( + ConstantFP::get(Type::getFloatTy(ForEnd164->getContext()), 0.0), OuterBB); + PhiFloat->addIncoming(Sum, ForEndLoopExit); + // Create a new StoreInst instruction in for.end164 + new StoreInst(PhiFloat, GEP, ForEnd164->getTerminator()); + + Value *operand1 = unroll_count == 16 + ? getFirstI32Phi(OuterBB) + : getLastICmpInst(CloneForBody)->getOperand(1); + // Create a new comparison instruction + ICmpInst *NewCmp = + new ICmpInst(ICmpInst::ICMP_UGT, PhiSum, operand1, "cmp182.not587"); + NewCmp->insertBefore(ForEnd164->getTerminator()); + + // Replace the original unconditional branch with a conditional branch + BranchInst *OldBr = cast(ForEnd164->getTerminator()); + BasicBlock *ForEnd37 = OldBr->getSuccessor(0); + BranchInst *NewBr = BranchInst::Create(ForEnd37, CloneForBody, NewCmp); + ReplaceInstWithInst(OldBr, NewBr); + + CloneForBody->moveAfter(ForEnd164); + Instruction *TargetInst = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + for (PHINode &Phi : CloneForBody->phis()) { + if (Phi.getType()->isIntegerTy(32)) { + Phi.setIncomingValue(0, getLastICmpInst(CloneForBody)->getOperand(0)); + Phi.setIncomingBlock(0, CloneForBody); + Phi.setIncomingValue(1, PhiSum); + Phi.setIncomingBlock(1, ForEnd164); + } else if (Phi.getType()->isFloatTy()) { + Phi.setIncomingValue(0, TargetInst); + Phi.setIncomingBlock(0, CloneForBody); + Phi.setIncomingValue(1, PhiFloat); + Phi.setIncomingBlock(1, ForEnd164); + } + } + + createCriticalEdgeAndMoveStoreInst(CloneForBody, ForEnd37); + + OuterBB->getTerminator()->setSuccessor(1, ForEnd164); +} + +static void PostUnrollConv(Function &F, Loop *L, int unroll_count, + int unroll_index) { + BasicBlock *ForBody = L->getHeader(); + BasicBlock *CloneForBody = + cloneBasicBlockWithRelations(ForBody, ".clone", &F); + CloneForBody->moveAfter(ForBody); + // Set the second branch of the terminator instruction of CloneForBody to + // ForBody + CloneForBody->getTerminator()->setSuccessor(1, ForBody); + + StringRef ForBodyName = ForBody->getName(); + // Get the basic blocks to merge + std::vector BBsToMerge; + for (int i = 1; i < unroll_count; ++i) { + std::string BBName = (ForBodyName + "." 
+ std::to_string(i)).str(); + BasicBlock *ForBodyClone = getBasicBlockByName(F, BBName); + if (ForBodyClone) { + BBsToMerge.push_back(ForBodyClone); + } + } + + if (BBsToMerge.size() == static_cast(unroll_count - 1)) { + for (BasicBlock *BB : BBsToMerge) { + MergeBasicBlockIntoOnlyPred(BB); + } + } + // Get the outer loop of L + Loop *OuterLoop = L->getParentLoop(); + if (unroll_count == 8 && unroll_index == 0) { + BasicBlock *CloneForBodyPreheader = BasicBlock::Create( + CloneForBody->getContext(), CloneForBody->getName() + ".preheader", + CloneForBody->getParent(), CloneForBody); + + updatePredecessorsToPreheader(CloneForBody, CloneForBodyPreheader); + auto [Sub, GEP, Add2] = + modifyOuterLoop4(OuterLoop, BBsToMerge[6], CloneForBodyPreheader); + modifyInnerLoop4(OuterLoop, BBsToMerge[6], Sub, CloneForBody, GEP, Add2, + CloneForBodyPreheader); + } else if (unroll_count == 16) { + auto [Add60, Add56, GEP] = modifyOuterLoop16(OuterLoop); + modifyInnerLoop(OuterLoop, BBsToMerge[14], Add60, CloneForBody, Add56, GEP, + unroll_count); + } else if (unroll_count == 8) { + auto [Add214, Add207, GEP] = modifyOuterLoop8(OuterLoop); + modifyInnerLoop(OuterLoop, BBsToMerge[6], Add214, CloneForBody, Add207, GEP, + unroll_count); + } + LLVM_DEBUG(F.dump()); +} + +static void modifyFirstCloneForBody(BasicBlock *CloneForBody, + PHINode *N_0_lcssa, + BasicBlock *ForBody27LrPh, + PHINode *CoeffPosLcssa, Value *Operand1) { + CloneForBody->getTerminator()->setSuccessor(1, CloneForBody); + for (PHINode &Phi : CloneForBody->phis()) { + Phi.setIncomingBlock(0, ForBody27LrPh); + Phi.setIncomingBlock(1, CloneForBody); + } + PHINode *FirstI32Phi = getFirstI32Phi(CloneForBody); + PHINode *LastI32Phi = getLastI32Phi(CloneForBody); + FirstI32Phi->setIncomingValue(0, N_0_lcssa); + FirstI32Phi->setIncomingBlock(0, ForBody27LrPh); + + Instruction *firstAddInst = nullptr; + Instruction *lastAddInst = nullptr; + for (Instruction &I : *CloneForBody) { + if (I.getOpcode() == Instruction::Add) { + if (!firstAddInst) { + firstAddInst = &I; + } + lastAddInst = &I; + } + } + ICmpInst *LastCmpInst = getLastICmpInst(CloneForBody); + LastCmpInst->setOperand(0, lastAddInst); + LastCmpInst->setOperand(1, Operand1); + FirstI32Phi->setIncomingValue(1, lastAddInst); + + LastI32Phi->setIncomingValue(0, CoeffPosLcssa); + LastI32Phi->setIncomingBlock(0, ForBody27LrPh); + + LastI32Phi->setIncomingValue(1, firstAddInst); +} + +static bool setBBFromOtherBB(Function &F, StringRef BBName, + BasicBlock *ForBodyMerged) { + // Find the first and last load instructions in ForBody27LrPh + LoadInst *FirstLoad = nullptr; + LoadInst *LastLoad = nullptr; + BasicBlock *ForBody27LrPh = getBasicBlockByName(F, BBName); + for (Instruction &I : *ForBody27LrPh) { + if (auto *LI = dyn_cast(&I)) { + if (!FirstLoad) { + FirstLoad = LI; + } + LastLoad = LI; + } + } + + assert(FirstLoad && LastLoad && "Find load instructions in ForBody27LrPh"); + + // modify getelementptr + // Traverse the GEP instructions in ForBodyMerged + std::vector GEPInsts; + for (Instruction &I : *ForBodyMerged) { + if (auto *GEP = dyn_cast(&I)) { + GEPInsts.push_back(GEP); + } + } + // Ensure there is at least one GEP instruction + if (!GEPInsts.empty()) { + for (size_t i = 0; i < GEPInsts.size(); ++i) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + if (i % 2 == 1) { // Odd + CurrentGEP->setOperand(0, LastLoad); + } else { // Even + CurrentGEP->setOperand(0, FirstLoad); + } + } + } + return true; +} + +// Function to modify the first loop in FIRD (Finite Impulse Response Design) +// 
transformation +static void modifyFirdFirstLoop(Function &F, Loop *L, BasicBlock *ForBodyMerged, + BasicBlock *CloneForBody) { + BasicBlock *ForCond23Preheader = + ForBodyMerged->getTerminator()->getSuccessor(0)->getSingleSuccessor(); + assert(ForCond23Preheader && + "ForCondPreheader should have single predecessor"); + + BasicBlock *ForCondCleanup3 = + getFirstI32Phi(ForCond23Preheader)->getIncomingBlock(0); + Instruction *FirstI32Phi = getFirstI32Phi(ForCondCleanup3); + + ICmpInst *LastICmp = getLastICmpInst(ForCondCleanup3); + // Create new add instruction + IRBuilder<> Builder(LastICmp); + Value *Add269 = Builder.CreateNSWAdd( + FirstI32Phi, ConstantInt::get(FirstI32Phi->getType(), 8), "add269"); + LastICmp->setOperand(0, Add269); + LastICmp->setPredicate(ICmpInst::ICMP_SGT); + swapTerminatorSuccessors(ForCondCleanup3); + + PHINode *N_069 = getFirstI32Phi(ForBodyMerged); + Value *Inc20_7 = N_069->getIncomingValue(1); + BasicBlock *ForBodyMergedLoopPreheader = N_069->getIncomingBlock(0); + // Create new phi node at the beginning of ForBodyMerged + PHINode *Add281 = PHINode::Create(Type::getInt32Ty(F.getContext()), 2, + "add281", &ForBodyMerged->front()); + + // Set incoming values for phi node + Add281->addIncoming(Add269, ForBodyMergedLoopPreheader); + Add281->addIncoming(Inc20_7, ForBodyMerged); + + N_069->setIncomingValue(1, Add281); + + ICmpInst *LastICmpInPreheader = getLastICmpInst(ForCond23Preheader); + // Create new phi node + PHINode *N_0_lcssa = PHINode::Create(Type::getInt32Ty(F.getContext()), 2, + "n.0.lcssa", LastICmpInPreheader); + + // Set incoming values for phi node + N_0_lcssa->addIncoming(FirstI32Phi, ForCondCleanup3); + N_0_lcssa->addIncoming(Add281, ForBodyMerged); + + // Replace operand of LastICmpInPreheader with new phi node + LastICmpInPreheader->setOperand(0, N_0_lcssa); + LastICmpInPreheader->setPredicate(ICmpInst::ICMP_SLT); + + Value *Operand1 = LastICmp->getOperand(1); + LastICmpInPreheader->setOperand(1, Operand1); + + // Get %coeff_pos.0.lcssa + PHINode *CoeffPosLcssa = getFirstI32Phi(ForCond23Preheader); + + // Insert new add instruction at the end of ForBodyMergedLoopPreheader + BasicBlock *ForBody27LrPh = + ForCond23Preheader->getTerminator()->getSuccessor(0); + Builder.SetInsertPoint(ForBody27LrPh->getTerminator()); + Value *Add11 = Builder.CreateAdd(Operand1, CoeffPosLcssa); + + ForBody27LrPh->getTerminator()->setSuccessor(0, CloneForBody); + ICmpInst *LastICmpInForBodyMerged = getLastICmpInst(ForBodyMerged); + LastICmpInForBodyMerged->setOperand(1, Operand1); + LastICmpInForBodyMerged->setOperand(0, Inc20_7); + + modifyFirstCloneForBody(CloneForBody, N_0_lcssa, ForBody27LrPh, CoeffPosLcssa, + Operand1); + + PHINode *acc_0_lcssa = getFirstFloatPhi(ForCond23Preheader); + BasicBlock *ForCond23PreheaderLoopExit = acc_0_lcssa->getIncomingBlock(1); + PHINode *_lcssa = getFirstFloatPhi(ForCond23PreheaderLoopExit); + acc_0_lcssa->setIncomingValue(1, _lcssa->getIncomingValue(0)); + acc_0_lcssa->setIncomingBlock(1, _lcssa->getIncomingBlock(0)); + + Value *floatZero = acc_0_lcssa->getIncomingValue(0); + + // Get all incoming values and blocks for PHINode + for (unsigned i = 1; i < _lcssa->getNumIncomingValues(); ++i) { + Value *IncomingValue = _lcssa->getIncomingValue(i); + BasicBlock *IncomingBlock = _lcssa->getIncomingBlock(i); + + // Create new phi node in ForCond23Preheader + PHINode *NewPhi = + PHINode::Create(floatZero->getType(), 2, + "acc." 
+ std::to_string(i) + ".lcssa", CoeffPosLcssa); + // Add incoming values + NewPhi->addIncoming(floatZero, ForCondCleanup3); + NewPhi->addIncoming(IncomingValue, IncomingBlock); + } + Value *coeff_pos_068 = getLastI32Phi(ForBodyMerged)->getIncomingValue(1); + CoeffPosLcssa->setIncomingValue(1, coeff_pos_068); + + getLastFloatPhi(CloneForBody)->setIncomingValue(0, acc_0_lcssa); + + BasicBlock *PredBB = ForBodyMerged->getSinglePredecessor(); + if (!PredBB) { + // If no single predecessor, iterate through all predecessors + for (BasicBlock *Pred : predecessors(ForBodyMerged)) { + PredBB = Pred; + break; // Only take first predecessor + } + } + SmallVector FMulAddCalls; + // insertPhiNodesForFMulAdd(ForBodyMerged, ForCond23PreHeader, FMulAddCalls); + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + for (Instruction &I : *ForBodyMerged) { + if (CallInst *CI = dyn_cast(&I)) { + if (Function *F = CI->getCalledFunction()) { + if (F->getName() == "llvm.fmuladd.f32" && CI->isTailCall()) { + FMulAddCalls.push_back(CI); + } + } + } + } + + // Insert phi nodes for each FMulAdd call + for (CallInst *CI : FMulAddCalls) { + // Create new phi node + PHINode *PHI = PHINode::Create(CI->getType(), 2, CI->getName() + "acc", CI); + + // Set incoming values for phi node + PHI->addIncoming(ConstantFP::get(CI->getType(), 0), PredBB); + PHI->addIncoming(CI, ForBodyMerged); + + CI->setOperand(2, PHI); + } + movePHINodesToTop(*ForBodyMerged); + modifyAddToOr(ForBodyMerged); + + ICmpInst *LastICmpForBodyMerged = getLastICmpInst(ForBodyMerged); + LastICmpForBodyMerged->setPredicate(ICmpInst::ICMP_SGT); + cast(LastICmpForBodyMerged->getOperand(0)) + ->setOperand(0, getFirstI32Phi(ForBodyMerged)); + + // Find first and last load instructions in ForBody14LrPh + LoadInst *FirstLoad = nullptr; + LoadInst *LastLoad = nullptr; + BasicBlock *ForBody14LrPh = getBasicBlockByName(F, "for.body14.lr.ph"); + for (Instruction &I : *ForBody14LrPh) { + if (auto *LI = dyn_cast(&I)) { + if (!FirstLoad) { + FirstLoad = LI; + } + LastLoad = LI; + } + } + + assert(FirstLoad && LastLoad && + "Failed to find load instructions in ForBody14LrPh"); + + // modify getelementptr + // Iterate through getelementptr instructions in ForBodyMerged + std::vector GEPInsts; + for (Instruction &I : *ForBodyMerged) { + if (auto *GEP = dyn_cast(&I)) { + GEPInsts.push_back(GEP); + } + } + // Ensure at least one getelementptr instruction exists + if (!GEPInsts.empty()) { + for (size_t i = 0; i < GEPInsts.size(); ++i) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + if (i % 2 == 1) { // Odd + CurrentGEP->setOperand(0, LastLoad); + } else { // Even + CurrentGEP->setOperand(0, FirstLoad); + } + } + } + + // Ensure at least one getelementptr instruction exists + if (!GEPInsts.empty()) { + // Get first getelementptr instruction + GetElementPtrInst *SecondGEP = GEPInsts[1]; + + // Starting from index 1, process every other getelementptr + for (size_t i = 3; i < GEPInsts.size(); i += 2) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + // Set current getelementptr's operand 0 to first getelementptr's value + CurrentGEP->setOperand(0, SecondGEP); + + // Set operand 1 to current index value + // ConstantInt *IndexValue = + // ConstantInt::get(CurrentGEP->getOperand(1)->getType(), i); + CurrentGEP->setOperand( + 1, ConstantInt::get(CurrentGEP->getOperand(1)->getType(), (i) / 2)); + } + } + + setBBFromOtherBB(F, "for.body27.lr.ph", CloneForBody); + + BasicBlock *ForCondCleanup26LoopExit = CloneForBody->getNextNode(); + BasicBlock *ForCondCleanup26 = 
ForCondCleanup26LoopExit->getSingleSuccessor(); + Instruction *tailcallInst = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + + // Find add instruction in ForBody27LrPh + Instruction *AddInst = nullptr; + for (Instruction &I : *ForBody27LrPh) { + if (I.getOpcode() == Instruction::Add) { + AddInst = &I; + break; + } + } + + // Insert new instructions in ForCondCleanup26LoopExit + Builder.SetInsertPoint(ForCondCleanup26LoopExit->getFirstNonPHI()); + Value *SubResult = Builder.CreateSub(AddInst, N_0_lcssa); + PHINode *firstFloatPhi = getFirstFloatPhi(ForCondCleanup26); + firstFloatPhi->setIncomingValue(1, tailcallInst); + + ForCond23Preheader->setName("for.cond63.preheader"); + // Create new PHI node in ForCondCleanup26 + PHINode *CoeffPosLcssaPhi = + PHINode::Create(CoeffPosLcssa->getType(), 2, "coeff_pos.1.lcssa", + &ForCondCleanup26->front()); + + // Set incoming values and blocks for PHI node + CoeffPosLcssaPhi->addIncoming(CoeffPosLcssa, ForCond23Preheader); + CoeffPosLcssaPhi->addIncoming(SubResult, ForCondCleanup26LoopExit); + // eraseAllStoreInstInBB(ForCondCleanup26); + + ICmpInst *LastICmpForCondCleanup26 = getLastICmpInst(ForCondCleanup26); + + LastICmpForCondCleanup26->setPredicate(ICmpInst::ICMP_SLT); + PHINode *FirstI32ForCondCleanup3 = getFirstI32Phi(ForCondCleanup3); + LastICmpForCondCleanup26->setOperand(0, FirstI32ForCondCleanup3); + LastICmpForCondCleanup26->setOperand( + 1, + ConstantInt::get(LastICmpForCondCleanup26->getOperand(1)->getType(), 8)); + + BasicBlock *ForBody79LrPh = + cloneBasicBlockWithRelations(ForBody27LrPh, ".clone", &F); + ForBody79LrPh->setName("for.body79.lr.ph"); + ForBody79LrPh->moveBefore(CloneForBody); + ForBody79LrPh->getTerminator()->setSuccessor(0, ForBodyMerged); + ForCondCleanup26->getTerminator()->setSuccessor(1, ForBody79LrPh); + // Create new and instruction in ForBody79LrPh + Builder.SetInsertPoint(ForBody79LrPh->getTerminator()); + Value *AndResult = Builder.CreateAnd( + FirstI32ForCondCleanup3, + ConstantInt::get(FirstI32ForCondCleanup3->getType(), 2147483640)); + + BasicBlock *ForCond130Preheader = + cloneBasicBlockWithRelations(ForCond23Preheader, ".clone", &F); + ForCond130Preheader->setName("for.cond130.preheader"); + ForCond130Preheader->moveAfter(CloneForBody); + ForCondCleanup26->getTerminator()->setSuccessor(0, ForCond130Preheader); + for (PHINode &Phi : ForCond130Preheader->phis()) { + Phi.setIncomingBlock(0, ForCondCleanup26); + } + // Iterate through phi nodes in ForCond130Preheader and ForCond23Preheader + // simultaneously + auto it130 = ForCond130Preheader->begin(); + auto it23 = ForCond23Preheader->begin(); + + while (it130 != ForCond130Preheader->end() && + it23 != ForCond23Preheader->end()) { + if (auto *phi130 = dyn_cast(&*it130)) { + if (auto *phi23 = dyn_cast(&*it23)) { + if (phi130->getType()->isFloatTy() && phi23->getType()->isFloatTy()) { + // Write phi float from ForCond23Preheader to incomingvalue 0 position + // in ForCond130Preheader + phi130->setIncomingValue(0, phi23); + } + } + ++it23; + } + ++it130; + } + getFirstFloatPhi(ForCond130Preheader)->setIncomingValue(0, firstFloatPhi); + + getFirstI32Phi(ForCond130Preheader) + ->setIncomingValue(0, getFirstI32Phi(ForCondCleanup26)); + + PHINode *LastI32Phi130 = getLastI32Phi(ForCond130Preheader); + LastI32Phi130->setIncomingValue( + 0, ConstantInt::get(getLastI32Phi(ForCond130Preheader)->getType(), 0)); + LastI32Phi130->setIncomingValue(1, AndResult); + + ICmpInst *LastICmp130 = getLastICmpInst(ForCond130Preheader); + LastICmp130->setOperand(1, 
FirstI32ForCondCleanup3); + + PHINode *LastI32PhiClone = getLastFloatPhi(CloneForBody); + LastI32PhiClone->setIncomingValue(1, tailcallInst); + + // modify for.cond23.preheader.loopexit + // modify for.cond63.preheader + for (PHINode &Phi : ForCond23Preheader->phis()) { + Phi.setIncomingBlock(1, ForBodyMerged); + } + ForBodyMerged->getTerminator()->setSuccessor(0, ForCond130Preheader); + + CloneForBody->getTerminator()->setSuccessor(0, ForCondCleanup26LoopExit); + + // Get for.cond.cleanup.loopexit basic block + BasicBlock *ForCondCleanupLoopExit = + getBasicBlockByName(F, "for.cond23.preheader.loopexit"); + + // Check if for.cond.cleanup.loopexit exists + if (ForCondCleanupLoopExit) { + // Check if for.cond.cleanup.loopexit has no predecessors + if (pred_empty(ForCondCleanupLoopExit)) { + // Delete for.cond.cleanup.loopexit basic block + ForCondCleanupLoopExit->eraseFromParent(); + } + } + + ForBodyMerged->getTerminator()->setSuccessor(0, ForCond23Preheader); +} + +static bool copyFloatPhiIncomingValue(int i, BasicBlock *srcBB, + BasicBlock *tarBB) { + assert(srcBB && tarBB && "srcBB or tarBB should not be nullptr"); + // Collect phi float nodes from ForCond130Preheader in reverse order into + // vector + SmallVector floatPhis; + + for (auto it = srcBB->rbegin(); it != srcBB->rend(); ++it) { + if (PHINode *phi = dyn_cast(&*it)) { + if (phi->getType()->isFloatTy()) { + floatPhis.push_back(phi->getIncomingValue(i)); + } + } + } + + // Traverse phi float nodes in ForBodyMerged in reverse order and store values + // from floatPhis into their incoming value 0 + auto floatPhiIt = floatPhis.begin(); + for (auto it = tarBB->rbegin(); + it != tarBB->rend() && floatPhiIt != floatPhis.end(); ++it) { + if (PHINode *phi = dyn_cast(&*it)) { + if (phi->getType()->isFloatTy()) { + phi->setIncomingValue(i, *floatPhiIt); + ++floatPhiIt; + } + } + } + return true; +} + +static void modifyFirdSecondLoop(Function &F, Loop *L, + BasicBlock *ForBodyMerged, + BasicBlock *CloneForBody) { + BasicBlock *ForBody = L->getHeader(); + + BasicBlock *ForBody133LrPh = + BasicBlock::Create(CloneForBody->getContext(), "for.body133.lr.ph", + CloneForBody->getParent(), CloneForBody); + + updatePredecessorsToPreheader(CloneForBody, ForBody133LrPh); + + BasicBlock *PredBB = ForBodyMerged->getSinglePredecessor(); + if (!PredBB) { + // If there is no single predecessor, iterate through all predecessors + for (BasicBlock *Pred : predecessors(ForBodyMerged)) { + PredBB = Pred; + break; // Only take the first predecessor + } + } + SmallVector FMulAddCalls; + // Collect all tail call float @llvm.fmuladd.f32 in LoopHeader + for (Instruction &I : *ForBodyMerged) { + if (CallInst *CI = dyn_cast(&I)) { + if (Function *F = CI->getCalledFunction()) { + if (F->getName() == "llvm.fmuladd.f32" && CI->isTailCall()) { + FMulAddCalls.push_back(CI); + } + } + } + } + + // Insert phi nodes for each FMulAdd call + for (CallInst *CI : FMulAddCalls) { + // Create new phi node + PHINode *PHI = PHINode::Create(CI->getType(), 2, CI->getName() + "acc", CI); + + // Set incoming values for phi node + PHI->addIncoming(ConstantFP::get(CI->getType(), 0), PredBB); + PHI->addIncoming(CI, ForBodyMerged); + + CI->setOperand(2, PHI); + } + PHINode *n22_075 = getFirstI32Phi(ForBodyMerged); + // Create new phi node in ForBodyMerged + PHINode *Add76310 = PHINode::Create(Type::getInt32Ty(F.getContext()), 2, + "add76310", &ForBodyMerged->front()); + Add76310->addIncoming(ConstantInt::get(Type::getInt32Ty(F.getContext()), 8), + ForBody133LrPh); + 
n22_075->setIncomingValue(1, Add76310); + // Create new add instruction in ForBodyMerged + IRBuilder<> Builder(ForBodyMerged->getTerminator()); + Value *Add76 = Builder.CreateAdd( + Add76310, ConstantInt::get(Type::getInt32Ty(F.getContext()), 8), "add76", + true, true); + + // Update phi node's loop edge + Add76310->addIncoming(Add76, ForBodyMerged); + + movePHINodesToTop(*ForBodyMerged); + modifyAddToOr(ForBodyMerged); + + ICmpInst *LastICmp = getLastICmpInst(ForBodyMerged); + LastICmp->setPredicate(ICmpInst::ICMP_SGT); + cast(Add76)->moveBefore(LastICmp); + LastICmp->setOperand(0, Add76); + for (PHINode &Phi : ForBodyMerged->phis()) { + Phi.setIncomingBlock(0, PredBB); + } + + BasicBlock *NewForEnd141 = + BasicBlock::Create(F.getContext(), "for.end141", &F, CloneForBody); + NewForEnd141->moveAfter(CloneForBody); + + BasicBlock *ForCond1Preheader = getBasicBlockByName(F, "for.cond1.preheader"); + for (PHINode &Phi : ForCond1Preheader->phis()) { + Phi.setIncomingBlock(1, NewForEnd141); + } + PHINode *ForCond1PreheaderLastI32Phi = getLastI32Phi(ForCond1Preheader); + // Insert new add instruction in NewForEnd141 + Builder.SetInsertPoint(NewForEnd141); + Value *Inc152 = + Builder.CreateAdd(ForCond1PreheaderLastI32Phi, + ConstantInt::get(Type::getInt32Ty(F.getContext()), 1), + "inc152", true, true); + Inc152->setName("inc152"); + + // Update PHI nodes in ForCond1Preheader + ForCond1PreheaderLastI32Phi->setIncomingValue(1, Inc152); + + BasicBlock *ForCondCleanup = getBasicBlockByName(F, "for.cond.cleanup"); + getFirstI32Phi(ForCondCleanup)->setIncomingBlock(1, NewForEnd141); + + // Find len parameter in function F + Value *LenArg = getLenFromEntryBlock(F); + assert(LenArg && "LenArg should be"); + + // Create comparison instruction + Value *ExitCond350 = Builder.CreateICmpEQ(Inc152, LenArg, "exitcond350.not"); + + // Create conditional branch instruction + Builder.CreateCondBr(ExitCond350, ForCondCleanup, ForCond1Preheader); + + BasicBlock *ForCond130Preheader = + getBasicBlockByName(F, "for.cond130.preheader"); + for (PHINode &phi : ForCond130Preheader->phis()) { + phi.setIncomingBlock(1, ForBodyMerged); + } + ForCond130Preheader->getTerminator()->setSuccessor(0, ForBody133LrPh); + ForCond130Preheader->getTerminator()->setSuccessor(1, NewForEnd141); + + // ForBody133LrPh + // Create new instructions in ForBody133LrPh + BasicBlock *ForBody79LrPh = getBasicBlockByName(F, "for.body79.lr.ph"); + ForBody79LrPh->getTerminator()->setSuccessor(0, ForBodyMerged); + // Copy loadinst from ForBody79LrPh to ForBody133LrPh + Builder.SetInsertPoint(ForBody133LrPh->getTerminator()); + for (Instruction &I : *ForBody79LrPh) { + if (isa(I)) { + Instruction *ClonedInst = I.clone(); + ClonedInst->setName(I.getName()); + Builder.Insert(ClonedInst); + } + } + + // modify ForBodyMerged + for (PHINode &Phi : ForBodyMerged->phis()) { + Phi.setIncomingBlock(0, ForBody79LrPh); + } + + PHINode *coeff_pos174 = getLastI32Phi(ForBodyMerged); + PHINode *coeff_pos_0_lcssa_clone = getFirstI32Phi(ForCond130Preheader); + coeff_pos_0_lcssa_clone->setIncomingValue(1, + coeff_pos174->getIncomingValue(1)); + coeff_pos174->setIncomingValue(0, + coeff_pos_0_lcssa_clone->getIncomingValue(0)); + + bool res = copyFloatPhiIncomingValue(0, ForCond130Preheader, ForBodyMerged); + assert(res && "copyFloatPhiIncomingZeroValue failed"); + + bool res1 = copyFloatPhiIncomingValue(1, ForBodyMerged, ForCond130Preheader); + assert(res1 && "copyFloatPhiIncomingValue failed"); + // Find first and last load instructions in ForBody79LrPh + LoadInst 
*FirstLoad = nullptr; + LoadInst *LastLoad = nullptr; + + for (Instruction &I : *ForBody79LrPh) { + if (auto *LI = dyn_cast(&I)) { + if (!FirstLoad) { + FirstLoad = LI; + } + LastLoad = LI; + } + } + + assert(FirstLoad && LastLoad && + "Could not find load instructions in ForBody79LrPh"); + // Iterate through GetElementPtrInst + std::vector GEPInsts; + for (Instruction &I : *ForBodyMerged) { + if (auto *GEP = dyn_cast(&I)) { + GEPInsts.push_back(GEP); + } + } + + // Ensure there is at least one getelementptr instruction + if (!GEPInsts.empty()) { + for (size_t i = 0; i < GEPInsts.size(); ++i) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + if (i % 2 == 1) { // odd + CurrentGEP->setOperand(0, LastLoad); + } else { // even + CurrentGEP->setOperand(0, FirstLoad); + } + } + } + + // Ensure there is at least one getelementptr instruction + if (!GEPInsts.empty()) { + // Get first getelementptr instruction + GetElementPtrInst *FirstGEP = GEPInsts[0]; + + // Starting from index 1, process every other getelementptr + for (size_t i = 2; i < GEPInsts.size(); i += 2) { + GetElementPtrInst *CurrentGEP = GEPInsts[i]; + + // Set current getelementptr's operand 0 to first getelementptr's value + CurrentGEP->setOperand(0, FirstGEP); + + // Set operand 1 to current index value + CurrentGEP->setOperand( + 1, ConstantInt::get(CurrentGEP->getOperand(1)->getType(), (i) / 2)); + } + } + + ForBodyMerged->getTerminator()->setSuccessor(0, ForCond130Preheader); + + // modify for.body27.clone + PHINode *n_0_lcssa_clone = getLastI32Phi(ForCond130Preheader); + PHINode *acc_0_lcssa_clone = getFirstFloatPhi(ForCond130Preheader); + Instruction *tailcallInst = + getFirstCallInstWithName(CloneForBody, "llvm.fmuladd.f32"); + Instruction *firstAddInst = nullptr; + Instruction *lastAddInst = nullptr; + for (Instruction &I : *CloneForBody) { + if (I.getOpcode() == Instruction::Add) { + if (!firstAddInst) { + firstAddInst = &I; + } + lastAddInst = &I; + } + } + int index = 0; + for (PHINode &Phi : CloneForBody->phis()) { + Phi.setIncomingBlock(0, ForBody133LrPh); + Phi.setIncomingBlock(1, CloneForBody); + if (index == 0) { + Phi.setIncomingValue(0, n_0_lcssa_clone); + Phi.setIncomingValue(1, lastAddInst); + } else if (index == 1) { + Phi.setIncomingValue(0, coeff_pos_0_lcssa_clone); + Phi.setIncomingValue(1, firstAddInst); + } else if (index == 2) { + Phi.setIncomingValue(0, acc_0_lcssa_clone); + Phi.setIncomingValue(1, tailcallInst); + } + index++; + } + + CloneForBody->getTerminator()->setSuccessor(0, NewForEnd141); + CloneForBody->getTerminator()->setSuccessor(1, CloneForBody); + + // modify for.end141 + // Create phi float node in NewForEnd141 + PHINode *AccPhi = PHINode::Create(Type::getFloatTy(F.getContext()), 2, + "acc0.3.lcssa", &NewForEnd141->front()); + AccPhi->addIncoming(acc_0_lcssa_clone, ForCond130Preheader); + AccPhi->addIncoming(tailcallInst, CloneForBody); + + int i = 0; + Value *Sum = nullptr; + Instruction *insertPoint = AccPhi->getNextNode(); + // Count the number of float type phi nodes in ForCond130Preheader + SmallVector floatPhis; + for (PHINode &phi : ForCond130Preheader->phis()) { + if (phi.getType()->isFloatTy()) { + floatPhis.push_back(&phi); + } + } + assert(floatPhis.size() == 8 && + "Expected 8 float phi nodes in ForCond130Preheader"); + // Create parallel add instructions for better performance + Value *Add60 = + BinaryOperator::CreateFAdd(floatPhis[1], AccPhi, "add60", insertPoint); + Value *Add61 = BinaryOperator::CreateFAdd(floatPhis[2], floatPhis[3], "add61", + insertPoint); + Value 
*Add62 = BinaryOperator::CreateFAdd(floatPhis[4], floatPhis[5], "add62", + insertPoint); + Value *Add63 = BinaryOperator::CreateFAdd(floatPhis[6], floatPhis[7], "add63", + insertPoint); + Value *Add64 = BinaryOperator::CreateFAdd(Add60, Add61, "add64", insertPoint); + Value *Add65 = BinaryOperator::CreateFAdd(Add62, Add63, "add65", insertPoint); + Value *Add66 = BinaryOperator::CreateFAdd(Add64, Add65, "add66", insertPoint); + Sum = Add66; + + // Move getelementptr and store instructions from for.cond.cleanup26 to + // NewForEnd141 + BasicBlock *ForCondCleanup26 = getBasicBlockByName(F, "for.cond.cleanup26"); + + SmallVector instructionsToMove; + + // Collect instructions to move + for (Instruction &I : *ForCondCleanup26) { + if (isa(I) || isa(I)) { + instructionsToMove.push_back(&I); + } + } + + // Move instructions + for (Instruction *I : instructionsToMove) { + I->moveBefore(insertPoint); + if (isa(I)) { + I->setOperand(0, Sum); + } + } + + // Update instructions that used moved instructions + for (Instruction &I : *NewForEnd141) { + I.replaceUsesOfWith(ForCondCleanup26, NewForEnd141); + } + + // Get for.cond.cleanup.loopexit basic block + BasicBlock *ForCondCleanupLoopExit = + getBasicBlockByName(F, "for.cond.cleanup.loopexit"); + + // Check if for.cond.cleanup.loopexit exists + if (ForCondCleanupLoopExit) { + // Check if for.cond.cleanup.loopexit has no predecessors + if (pred_empty(ForCondCleanupLoopExit)) { + // Delete for.cond.cleanup.loopexit basic block + ForCondCleanupLoopExit->eraseFromParent(); + } + } + + setBBFromOtherBB(F, "for.body133.lr.ph", CloneForBody); +} + +// Main function to perform FIRD unrolling +static void PostUnrollFird(Function &F, Loop *L, int loop_index) { + BasicBlock *ForBody = L->getHeader(); + BasicBlock *CloneForBody = + cloneBasicBlockWithRelations(ForBody, ".clone", &F); + CloneForBody->moveAfter(ForBody); + CloneForBody->getTerminator()->setSuccessor(1, ForBody); + + // Merge basic blocks + std::vector BBsToMerge; + for (int i = 1; i < 8; ++i) { + std::string BBName = (ForBody->getName() + "." 
+ std::to_string(i)).str(); + BasicBlock *ForBodyClone = getBasicBlockByName(F, BBName); + if (ForBodyClone) { + BBsToMerge.push_back(ForBodyClone); + } else { + llvm_unreachable("can't find ForBodyClone"); + } + } + if (BBsToMerge.size() == 7) { + for (BasicBlock *BB : BBsToMerge) { + MergeBasicBlockIntoOnlyPred(BB); + } + } + BasicBlock *ForBodyMerged = BBsToMerge[6]; + CloneForBody->moveAfter(ForBodyMerged); + + // Perform loop-specific modifications + if (loop_index == 1) { + modifyFirdFirstLoop(F, L, ForBodyMerged, CloneForBody); + } else if (loop_index == 2) { + modifyFirdSecondLoop(F, L, ForBodyMerged, CloneForBody); + } +} + +// Helper function to check if a loop is simple (single-level, innermost, and +// outermost) +static bool isSimpleLoop(const Loop *L) { + return L->getLoopDepth() == 1 && L->isInnermost() && L->isOutermost(); +} + +// Handle simple loops +static bool handleSimpleLoop(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE) { + if (shouldUnrollLoopWithCount(F, L, SE)) { + LLVM_DEBUG(errs() << "Unrolling loop with count\n"); + auto UnrollResult = + UnrollLoop(L, + {/*Count*/ 8, /*Force*/ true, /*Runtime*/ false, + /*AllowExpensiveTripCount*/ true, + /*UnrollRemainder*/ true, true}, + LI, &SE, &DT, &AC, &TTI, /*ORE*/ &ORE, true); + postUnrollLoopWithCount(F, L, 8); + return true; + } + + if (shouldUnrollComplexLoop(F, L, SE, DT, *LI)) { + LLVM_DEBUG(errs() << "Unrolling complex loop\n"); + auto UnrollResult = + UnrollLoop(L, + {/*Count*/ 8, /*Force*/ true, /*Runtime*/ false, + /*AllowExpensiveTripCount*/ true, + /*UnrollRemainder*/ true, true}, + LI, &SE, &DT, &AC, &TTI, /*ORE*/ &ORE, true); + postUnrollLoopWithVariable(F, L, 8); + return true; + } + + if (shouldUnrollAddcType(F, LI)) { + LLVM_DEBUG(errs() << "Unrolling ADDC type loop\n"); + unrollAddc(F, SE, L, 16); + currentUnrollType = UnrollType::ADD_ADDC_SUB_MUL_MULC_SQRT; + return true; + } + + if (shouldUnrollDotprodType(F, LI)) { + LLVM_DEBUG(errs() << "Transforming dot product type loop\n"); + currentUnrollType = UnrollType::DOTPROD; + transformOneLoopDepth(F); + return true; + } + + LLVM_DEBUG(errs() << "No unrolling performed for this loop\n"); + return false; +} + +// Helper function to simplify loop and form LCSSA +static void simplifyAndFormLCSSA(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution &SE, AssumptionCache &AC) { + simplifyLoop(L, &DT, LI, &SE, &AC, nullptr, false); + formLCSSARecursively(*L, DT, LI, &SE); +} + +// Helper function to get CONV unroll factor +static unsigned int getConvUnrollFactor(uint32_t unrollCount) { + static const unsigned int unrollFactors[] = {8, 16, 8}; + return unrollFactors[unrollCount % 3]; +} + +// Handle CONV type unrolling +static bool handleConvUnroll(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + uint32_t &unrollCount) { + LLVM_DEBUG(errs() << "Unrolling CONV type loop\n"); + currentUnrollType = UnrollType::CONV_CCORR; + + unsigned int unrollFactor = getConvUnrollFactor(unrollCount); + simplifyAndFormLCSSA(L, DT, LI, SE, AC); + + auto UnrollResult = + UnrollLoop(L, {unrollFactor, true, false, true, true, true}, LI, &SE, &DT, + &AC, &TTI, &ORE, true); + + unrollCount++; + return true; +} + +// Handle FIRD type unrolling +static bool handleFirdUnroll(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, 
DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + uint32_t &unroll_times) { + LLVM_DEBUG(errs() << "Unrolling FIRD type loop\n"); + currentUnrollType = UnrollType::FIRD; + + if (unroll_times == 0) { + unroll_times++; + return false; + } + + simplifyAndFormLCSSA(L, DT, LI, SE, AC); + + auto UnrollResult = UnrollLoop(L, {8, true, false, true, true, true}, LI, &SE, + &DT, &AC, &TTI, &ORE, false); + + return true; +} + +// Handle innermost loops +static bool handleInnermostLoop(Function &F, Loop *L, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree &DT, + AssumptionCache &AC, + const TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + uint32_t &unrollCount) { + if (shouldUnrollCorr(F, LI)) { + LLVM_DEBUG(errs() << "Unrolling correlation type loop\n"); + unrollCorr(F, L, 16); + currentUnrollType = UnrollType::CORR; + return true; + } + + if (shouldUnrollFirType(F, LI) || currentUnrollType == UnrollType::FIR) { + LLVM_DEBUG(errs() << "Transforming FIR type loop\n"); + unrollFir(F, L); + currentUnrollType = UnrollType::FIR; + return true; + } + + if (shouldUnrollConvccorr(F, LI) || + currentUnrollType == UnrollType::CONV_CCORR) { + return handleConvUnroll(F, L, SE, LI, DT, AC, TTI, ORE, unrollCount); + } + + if (shouldUnrollFird(F, LI) || currentUnrollType == UnrollType::FIRD) { + return handleFirdUnroll(F, L, SE, LI, DT, AC, TTI, ORE, unrollCount); + } + + LLVM_DEBUG(errs() << "No unrolling performed for this innermost loop\n"); + return false; +} + +// Check if unrolling should be disabled +static bool shouldDisableUnroll(const Loop *L) { + TransformationMode TM = hasUnrollTransformation(L); + return (TM & TM_Disable) != 0; +} + +static LoopUnrollResult +tryToUnrollLoop(Function &F, Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) { + // Initialize variables + bool changed = false; + static uint32_t unrollCount = 0; + // Handle single-level loops + if (isSimpleLoop(L)) { + changed = handleSimpleLoop(F, L, SE, LI, DT, AC, TTI, ORE); + } + // Handle innermost loops + else if (L->isInnermost()) { + changed = handleInnermostLoop(F, L, SE, LI, DT, AC, TTI, ORE, unrollCount); + } + + // Check if unrolling should be disabled + if (shouldDisableUnroll(L)) { + return LoopUnrollResult::Unmodified; + } + + return changed ? 
LoopUnrollResult::PartiallyUnrolled + : LoopUnrollResult::Unmodified; +} + +// Helper function to process CONV unroll type +void processConvUnroll(Function &F, const SmallVector &InnerLoops) { + static const int unroll_counts[] = {8, 16, 8}; + static int unroll_index = 0; + for (auto *L : InnerLoops) { + PostUnrollConv(F, L, unroll_counts[unroll_index], unroll_index); + unroll_index = (unroll_index + 1) % 3; + } +} + +// Helper function to process FIRD unroll type +void processFirdUnroll(Function &F, const SmallVector &InnerLoops) { + static int loop_index = 0; + for (auto &L : InnerLoops) { + if (loop_index == 0) { + loop_index++; + continue; + } + PostUnrollFird(F, L, loop_index); + loop_index++; + } +} + +static void addCommonOptimizationPasses(Function &F) { + // Create necessary analysis managers + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + + // Create pass builder + PassBuilder PB; + + // Register analyses + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + // Create function-level optimization pipeline + FunctionPassManager FPM; + + if (currentUnrollType == UnrollType::CORR || + currentUnrollType == UnrollType::FIRD) + FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass())); + FPM.addPass(EarlyCSEPass(true)); + FPM.addPass(ReassociatePass()); + + FPM.run(F, FAM); +} + +static void addLegacyCommonOptimizationPasses(Function &F) { + legacy::FunctionPassManager FPM(F.getParent()); + FPM.add(createLoopSimplifyPass()); + FPM.add(createLICMPass()); // Loop Invariant Code Motion + + // Add SimplifyCFG pass with common options + FPM.add(createCFGSimplificationPass( + SimplifyCFGOptions() + .bonusInstThreshold(1) // Set instruction bonus threshold + .forwardSwitchCondToPhi( + true) // Allow forwarding switch conditions to phi + .convertSwitchToLookupTable( + true) // Allow converting switch to lookup table + .needCanonicalLoops(false) // Don't require canonical loop form + .hoistCommonInsts(true) // Hoist common instructions + .sinkCommonInsts(true) // Sink common instructions + )); + + // Initialize and run passes + FPM.doInitialization(); + FPM.run(F); + FPM.doFinalization(); +} + +PreservedAnalyses +RISCVLoopUnrollAndRemainderPass::run(Function &F, FunctionAnalysisManager &AM) { + if (!EnableRISCVLoopUnrollAndRemainder || F.arg_empty()) + return PreservedAnalyses::all(); + + addnoalias(F); + auto &LI = AM.getResult(F); + if (LI.empty()) + return PreservedAnalyses::all(); + + // Retrieve necessary analysis results + auto &SE = AM.getResult(F); + auto &TTI = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &AC = AM.getResult(F); + auto &ORE = AM.getResult(F); + + LoopAnalysisManager *LAM = nullptr; + if (auto *LAMProxy = AM.getCachedResult(F)) + LAM = &LAMProxy->getManager(); + + auto &MAMProxy = AM.getResult(F); + ProfileSummaryInfo *PSI = + MAMProxy.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) + ? 
&AM.getResult(F) + : nullptr; + + bool Changed = false; + + // Process loops in reverse order of LoopInfo + SmallPriorityWorklist Worklist; + appendLoopsToWorklist(LI, Worklist); + SmallVector InnerLoops; + + while (!Worklist.empty()) { + Loop &L = *Worklist.pop_back_val(); + if (L.getBlocks().empty()) { + LLVM_DEBUG(errs() << "Skipping empty loop\n"); + continue; + } + + std::string LoopName = std::string(L.getName()); + if (L.getName().contains(".clone")) + continue; + + if (L.isInnermost()) { + InnerLoops.push_back(&L); + } + + LoopUnrollResult Result = + tryToUnrollLoop(F, &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI); + Changed |= Result != LoopUnrollResult::Unmodified; + + // Clear cached analysis results if loop was fully unrolled + if (LAM && Result == LoopUnrollResult::FullyUnrolled) + LAM->clear(L, LoopName); + } + + // Post-processing for specific unroll types + if (currentUnrollType == UnrollType::CONV_CCORR) { + processConvUnroll(F, InnerLoops); + } else if (currentUnrollType == UnrollType::FIRD) { + processFirdUnroll(F, InnerLoops); + } + + // Run dead code elimination + runDeadCodeElimination(F); + if (currentUnrollType != UnrollType::FIR) + addCommonOptimizationPasses(F); + if (currentUnrollType == UnrollType::FIRD) { + addLegacyCommonOptimizationPasses(F); + } + // Verify function + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(errs() << "Function verification failed\n"); + report_fatal_error("Function verification failed"); + } + + return Changed ? getLoopPassPreservedAnalyses() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h new file mode 100644 index 00000000000000..9e941cae210ad1 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVLoopUnrollAndRemainder.h @@ -0,0 +1,42 @@ +//===- RISCVLoopUnrollAndRemainder.h - Loop Unrolling and Remainder Handling +//------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RISCVLoopUnrollAndRemainder pass +// +// This pass performs loop unrolling and handles the remainder iterations. +// It aims to improve performance by: +// 1. Unrolling loops to reduce loop overhead and enable further optimizations +// 2. 
Generating efficient code for handling any remaining iterations +// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_RISCVLOOPUNROLLANDREMAINDER_H +#define LLVM_TRANSFORMS_UTILS_RISCVLOOPUNROLLANDREMAINDER_H + +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class RecurrenceDescriptor; +extern cl::opt EnableRISCVLoopUnrollAndRemainder; +class Function; + +struct RISCVLoopUnrollAndRemainderPass + : public PassInfoMixin { + RISCVLoopUnrollAndRemainderPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_RISCVLOOPUNROLLANDREMAINDER_H diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 6b30ce7f904bb5..0c70f5e67a5266 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h" #include "RISCVCustomLICM.h" +#include "RISCVLoopUnrollAndRemainder.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSplitLoopByLength.h" #include "RISCVTargetObjectFile.h" @@ -459,6 +460,10 @@ void RISCVTargetMachine::registerPassBuilderCallbacks( FPM.addPass(RISCVCustomLICMPass()); return true; } + if (Name == "riscv-loop-unroll-and-remainder") { + FPM.addPass(RISCVLoopUnrollAndRemainderPass()); + return true; + } return false; }); @@ -467,9 +472,11 @@ void RISCVTargetMachine::registerPassBuilderCallbacks( if(EnableEsp32P4Optimize && (Level == OptimizationLevel::O3 || Level == OptimizationLevel::O2)){ EnableRISCVSplitLoopByLength = true; EnableRISCVCustomLICM = true; + EnableRISCVLoopUnrollAndRemainder = true; FunctionPassManager FPM; FPM.addPass(RISCVSplitLoopByLengthPass()); FPM.addPass(RISCVCustomLICMPass()); + FPM.addPass(RISCVLoopUnrollAndRemainderPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } }); diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll index 3960501c6ff11d..a608ae2933aecf 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/add.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_add_f32_ansi(ptr noundef readonly %input1, ptr noundef readonly %input2, ptr noundef writeonly %output, i32 noundef %len, i32 noundef %step1, i32 noundef %step2, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_add_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT1:%.*]], ptr noundef readonly [[INPUT2:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT1:%.*]], ptr noalias noundef readonly [[INPUT2:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 
noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT1]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[INPUT2]], null @@ -12,19 +12,159 @@ define dso_local noundef i32 @dsps_add_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: br i1 [[OR_COND19]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP720:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP720]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_021_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_021_MODIFY]], 16 +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[I_021_MODIFY]], 1 +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_021_MODIFY]], 2 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_021_MODIFY]], 3 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_021_MODIFY]], 4 +; CHECK-NEXT: [[ADD23:%.*]] = or disjoint i32 [[I_021_MODIFY]], 5 +; CHECK-NEXT: [[ADD28:%.*]] = or disjoint i32 [[I_021_MODIFY]], 6 +; CHECK-NEXT: [[ADD33:%.*]] = or disjoint i32 [[I_021_MODIFY]], 7 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_021_MODIFY]], 8 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[I_021_MODIFY]], 9 +; CHECK-NEXT: [[ADD48:%.*]] = or disjoint i32 [[I_021_MODIFY]], 10 +; CHECK-NEXT: [[ADD53:%.*]] = or disjoint i32 [[I_021_MODIFY]], 11 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_021_MODIFY]], 12 +; CHECK-NEXT: [[ADD63:%.*]] = or disjoint i32 [[I_021_MODIFY]], 13 +; CHECK-NEXT: [[ADD68:%.*]] = or disjoint i32 [[I_021_MODIFY]], 14 +; CHECK-NEXT: [[ADD73:%.*]] = or disjoint i32 [[I_021_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX9_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX11_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 
[[ADD13]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD48]] +; CHECK-NEXT: [[ARRAYIDX50:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD48]] +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD48]] +; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD53]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD53]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD53]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX62:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD63]] +; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD63]] +; CHECK-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD63]] +; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD68]] +; CHECK-NEXT: [[ARRAYIDX70:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD68]] +; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD68]] +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD73]] +; CHECK-NEXT: [[ARRAYIDX75:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 
[[ADD73]] +; CHECK-NEXT: [[ARRAYIDX77:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD73]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_MODIFY]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX50]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX54]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX65]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX69]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX70]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX74]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX75]], align 4 +; CHECK-NEXT: [[ADD_MODIFY:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[ADD11:%.*]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[ADD21:%.*]] = fadd float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[ADD26:%.*]] = fadd float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[ADD36:%.*]] = fadd float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[ADD41:%.*]] = fadd float [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ADD46:%.*]] = fadd float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[ADD51:%.*]] = fadd float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[ADD56:%.*]] = fadd float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[ADD61:%.*]] = fadd float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[ADD66:%.*]] = fadd float [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[ADD71:%.*]] = fadd float [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[ADD76:%.*]] = fadd float [[TMP32]], [[TMP33]] +; CHECK-NEXT: store float [[ADD_MODIFY]], ptr [[ARRAYIDX11_MODIFY]], align 4 +; CHECK-NEXT: store float [[ADD5]], ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: store float 
[[ADD11]], ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: store float [[ADD16]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: store float [[ADD26]], ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: store float [[ADD31]], ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: store float [[ADD36]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[ADD41]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: store float [[ADD46]], ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: store float [[ADD51]], ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: store float [[ADD56]], ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[ADD61]], ptr [[ARRAYIDX62]], align 4 +; CHECK-NEXT: store float [[ADD66]], ptr [[ARRAYIDX67]], align 4 +; CHECK-NEXT: store float [[ADD71]], ptr [[ARRAYIDX72]], align 4 +; CHECK-NEXT: store float [[ADD76]], ptr [[ARRAYIDX77]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_021]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[I_021]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP34]], [[TMP35]] ; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[I_021]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10]] ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX11]], align 4 @@ -35,11 +175,11 @@ define dso_local noundef i32 @dsps_add_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[I_021_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL8_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8_CLONE]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 -; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 +; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[TMP36]], [[TMP37]] ; CHECK-NEXT: [[MUL10_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10_CLONE]] ; CHECK-NEXT: store float [[ADD_CLONE]], ptr [[ARRAYIDX11_CLONE]], align 4 @@ -47,7 +187,7 @@ define dso_local noundef i32 
@dsps_add_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll index dd35ce0373fc6d..bf98ec71686bc5 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/addc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, ptr noundef writeonly %output, i32 noundef %len, float noundef %C, i32 noundef %step_in, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_addc_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], float noundef [[C:%.*]], i32 noundef [[STEP_IN:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], float noundef [[C:%.*]], i32 noundef [[STEP_IN:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[OUTPUT]], null @@ -10,16 +10,124 @@ define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: br i1 [[OR_COND]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP412:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP412]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_013_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_013_MODIFY]], 16 +; CHECK-NEXT: 
[[ADD2:%.*]] = or disjoint i32 [[I_013_MODIFY]], 1 +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_013_MODIFY]], 2 +; CHECK-NEXT: [[ADD10:%.*]] = or disjoint i32 [[I_013_MODIFY]], 3 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[I_013_MODIFY]], 4 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_013_MODIFY]], 5 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_013_MODIFY]], 6 +; CHECK-NEXT: [[ADD26:%.*]] = or disjoint i32 [[I_013_MODIFY]], 7 +; CHECK-NEXT: [[ADD30:%.*]] = or disjoint i32 [[I_013_MODIFY]], 8 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_013_MODIFY]], 9 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_013_MODIFY]], 10 +; CHECK-NEXT: [[ADD42:%.*]] = or disjoint i32 [[I_013_MODIFY]], 11 +; CHECK-NEXT: [[ADD46:%.*]] = or disjoint i32 [[I_013_MODIFY]], 12 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[I_013_MODIFY]], 13 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[I_013_MODIFY]], 14 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_013_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_013_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX6_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_013_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds 
float, ptr [[INPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX53:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX43]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[ADD_MODIFY:%.*]] = fadd float [[C]], [[TMP2]] +; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[C]], [[TMP3]] +; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[C]], [[TMP4]] +; CHECK-NEXT: [[ADD12:%.*]] = fadd float [[C]], [[TMP5]] +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[C]], [[TMP6]] +; CHECK-NEXT: [[ADD20:%.*]] = fadd float [[C]], [[TMP7]] +; CHECK-NEXT: [[ADD24:%.*]] = fadd float [[C]], [[TMP8]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[C]], [[TMP9]] +; CHECK-NEXT: [[ADD32:%.*]] = fadd float [[C]], [[TMP10]] +; CHECK-NEXT: [[ADD36:%.*]] = fadd float [[C]], [[TMP11]] +; CHECK-NEXT: [[ADD40:%.*]] = fadd float [[C]], [[TMP12]] +; CHECK-NEXT: [[ADD44:%.*]] = fadd float [[C]], [[TMP13]] +; CHECK-NEXT: [[ADD48:%.*]] = fadd float [[C]], [[TMP14]] +; CHECK-NEXT: [[ADD52:%.*]] = fadd float [[C]], [[TMP15]] +; CHECK-NEXT: [[ADD56:%.*]] = fadd float [[C]], [[TMP16]] +; CHECK-NEXT: [[ADD60:%.*]] = fadd float [[C]], [[TMP17]] +; CHECK-NEXT: store float [[ADD_MODIFY]], ptr [[ARRAYIDX6_MODIFY]], align 4 +; CHECK-NEXT: store float [[ADD4]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: store float [[ADD8]], ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: store float [[ADD12]], ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: store float [[ADD16]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[ADD20]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[ADD24]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: store float [[ADD32]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[ADD36]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[ADD40]], ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: store float [[ADD44]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: store float [[ADD48]], ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: store float [[ADD52]], ptr [[ARRAYIDX53]], align 4 +; CHECK-NEXT: store float [[ADD56]], 
ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[ADD60]], ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_013]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[C]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[C]], [[TMP18]] ; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[I_013]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL5]] ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX6]], align 4 @@ -30,8 +138,8 @@ define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: [[I_013_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_013_CLONE]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[ADD_CLONE:%.*]] = fadd float [[C]], [[TMP19]] ; CHECK-NEXT: [[MUL5_CLONE:%.*]] = mul nsw i32 [[I_013_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX6_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL5_CLONE]] ; CHECK-NEXT: store float [[ADD_CLONE]], ptr [[ARRAYIDX6_CLONE]], align 4 @@ -39,7 +147,7 @@ define dso_local noundef i32 @dsps_addc_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll index 11c9c556d526e6..0432a51dfbb38e 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/ccorr.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_ccorr_f32_ansi(ptr noundef readonly %Signal, i32 noundef %siglen, ptr noundef readonly %Kernel, i32 
noundef %kernlen, ptr noundef writeonly %corrvout) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_ccorr_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noundef writeonly [[CORRVOUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noalias noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noalias noundef writeonly [[CORRVOUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[SIGNAL]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[KERNEL]], null @@ -21,36 +21,131 @@ define dso_local noundef i32 @dsps_ccorr_f32_ansi(ptr noundef readonly %Signal, ; CHECK-NEXT: [[KERN_0:%.*]] = phi ptr [ [[SIGNAL]], [[IF_THEN8]] ], [ [[KERNEL]], [[IF_END6]] ] ; CHECK-NEXT: [[SIG_0:%.*]] = phi ptr [ [[KERNEL]], [[IF_THEN8]] ], [ [[SIGNAL]], [[IF_END6]] ] ; CHECK-NEXT: [[CMP10124:%.*]] = icmp sgt i32 [[LKERN_0]], 0 -; CHECK-NEXT: br i1 [[CMP10124]], label [[FOR_BODY:%.*]], label [[FOR_COND22_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP10124]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND22_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond22.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND22_PREHEADER]] ; CHECK: for.cond22.preheader: ; CHECK-NEXT: [[CMP23128:%.*]] = icmp slt i32 [[LKERN_0]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[CMP23128]], label [[FOR_BODY25:%.*]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP23128]], label [[FOR_BODY25_PREHEADER:%.*]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK: for.body25.preheader: +; CHECK-NEXT: [[DIV536:%.*]] = and i32 [[LKERN_0]], -16 +; CHECK-NEXT: br label [[FOR_BODY25:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[IF_END9]] ] -; CHECK-NEXT: [[N_0125:%.*]] = phi i32 [ [[INC19:%.*]], [[FOR_END]] ], [ 0, [[IF_END9]] ] -; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[N_0125]], -1 -; CHECK-NEXT: [[SUB11:%.*]] = add nsw i32 [[LKERN_0]], [[TMP0]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_0125:%.*]] = phi i32 [ [[INC19:%.*]], [[FOR_END]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[N_0125]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[N_0125]], 2147483640 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[N_0125]], -1 +; CHECK-NEXT: [[SUB11:%.*]] = add nsw i32 [[TMP3]], [[LKERN_0]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[CORRVOUT]], i32 [[N_0125]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[FOR_BODY14:%.*]] -; CHECK: for.body14: -; CHECK-NEXT: [[K_0123:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP4:%.*]], [[FOR_BODY14]] ] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY14_CLONE_PREHEADER:%.*]], label [[FOR_BODY14_7:%.*]] +; CHECK: for.body14.7: +; CHECK-NEXT: [[K_0123:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC_7:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP20:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI1:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP21:%.*]], 
[[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI2:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP22:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI3:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP23:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI5:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP26:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[DOTPHI7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP27:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[K_0123]], [[SUB11]] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[K_0123]], 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[INC]], [[SUB11]] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[K_0123]], 2 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[INC_1]], [[SUB11]] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[K_0123]], 3 +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[INC_2]], [[SUB11]] +; CHECK-NEXT: [[INC_3:%.*]] = add nuw nsw i32 [[K_0123]], 4 +; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[INC_3]], [[SUB11]] +; CHECK-NEXT: [[INC_4:%.*]] = add nuw nsw i32 [[K_0123]], 5 +; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[INC_4]], [[SUB11]] +; CHECK-NEXT: [[INC_5:%.*]] = add nuw nsw i32 [[K_0123]], 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[INC_5]], [[SUB11]] +; CHECK-NEXT: [[INC_6:%.*]] = add nuw nsw i32 [[K_0123]], 7 +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[INC_6]], [[SUB11]] +; CHECK-NEXT: [[INC_7]] = add nuw nsw i32 [[K_0123]], 8 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0123]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUB11]], [[K_0123]] ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP4]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[TMP1]]) -; CHECK-NEXT: store float [[TMP4]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K_0123]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY14]] +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC]] +; CHECK-NEXT: [[ARRAYIDX16_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_1]] +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_1]] +; CHECK-NEXT: [[ARRAYIDX16_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_2]] +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_2]] +; CHECK-NEXT: [[ARRAYIDX16_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_3]] +; CHECK-NEXT: [[ARRAYIDX15_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_3]] +; CHECK-NEXT: [[ARRAYIDX16_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_4]] +; CHECK-NEXT: [[ARRAYIDX15_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_4]] +; CHECK-NEXT: [[ARRAYIDX16_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_5]] +; CHECK-NEXT: [[ARRAYIDX15_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_5]] +; CHECK-NEXT: [[ARRAYIDX16_6:%.*]] = getelementptr inbounds float, ptr 
[[KERN_0]], i32 [[ADD_6]] +; CHECK-NEXT: [[ARRAYIDX15_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_6]] +; CHECK-NEXT: [[ARRAYIDX16_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_7]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX15_1]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX16_1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX15_2]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16_2]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15_3]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX16_3]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX15_4]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX16_4]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX15_5]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX16_5]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX15_6]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX16_6]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX15_7]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX16_7]], align 4 +; CHECK-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[DOTPHI]]) +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[DOTPHI1]]) +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[DOTPHI2]]) +; CHECK-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[DOTPHI3]]) +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[DOTPHI4]]) +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[DOTPHI5]]) +; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP16]], float [[TMP17]], float [[DOTPHI6]]) +; CHECK-NEXT: [[TMP27]] = tail call float @llvm.fmuladd.f32(float [[TMP18]], float [[TMP19]], float [[DOTPHI7]]) +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp ult i32 [[INC_7]], [[TMP1]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[FOR_BODY14_7]], label [[FOR_END8:%.*]] +; CHECK: for.end8: +; CHECK-NEXT: [[SUM:%.*]] = fadd float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[SUM24:%.*]] = fadd float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[SUM25:%.*]] = fadd float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[SUM26:%.*]] = fadd float [[SUM]], [[SUM23]] +; CHECK-NEXT: [[SUM27:%.*]] = fadd float [[SUM24]], [[SUM25]] +; CHECK-NEXT: [[SUM28:%.*]] = fadd float [[SUM26]], [[SUM27]] +; CHECK-NEXT: store float [[SUM28]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br i1 false, label [[FOR_END]], label [[FOR_BODY14_CLONE_PREHEADER]] +; CHECK: for.body14.clone.preheader: +; CHECK-NEXT: [[SUM_PHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[SUM28]], [[FOR_END8]] ] +; CHECK-NEXT: [[ADD_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP0]], [[FOR_END8]] ] +; CHECK-NEXT: br label [[FOR_BODY14_CLONE:%.*]] +; CHECK: for.body14.clone: +; CHECK-NEXT: [[K_0123_CLONE:%.*]] = phi i32 [ [[ADD_PHI]], [[FOR_BODY14_CLONE_PREHEADER]] ], [ [[INC_CLONE:%.*]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi float [ [[SUM_PHI]], 
[[FOR_BODY14_CLONE_PREHEADER]] ], [ [[TMP31:%.*]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0123_CLONE]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 +; CHECK-NEXT: [[ADD_CLONE:%.*]] = add i32 [[K_0123_CLONE]], [[SUB11]] +; CHECK-NEXT: [[ARRAYIDX16_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[ADD_CLONE]] +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX16_CLONE]], align 4 +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[TMP28]]) +; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[K_0123_CLONE]], 1 +; CHECK-NEXT: [[EXITCOND_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[INDVARS_IV]] +; CHECK-NEXT: br i1 [[EXITCOND_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY14_CLONE]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: store float [[TMP31]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[INC19]] = add nuw nsw i32 [[N_0125]], 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND134_NOT:%.*]] = icmp eq i32 [[INC19]], [[LKERN_0]] -; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[FOR_COND22_PREHEADER]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[FOR_COND22_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.cond45.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND45_PREHEADER]] ; CHECK: for.cond45.preheader: ; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[SIGLEN]], -1 ; CHECK-NEXT: [[SUB47:%.*]] = add i32 [[ADD46]], [[KERNLEN]] @@ -60,57 +155,308 @@ define dso_local noundef i32 @dsps_ccorr_f32_ansi(ptr noundef readonly %Signal, ; CHECK-NEXT: [[SUB57:%.*]] = add nsw i32 [[LSIG_0]], -1 ; CHECK-NEXT: br label [[FOR_BODY50:%.*]] ; CHECK: for.body25: -; CHECK-NEXT: [[N21_0129:%.*]] = phi i32 [ [[INC42:%.*]], [[FOR_END40:%.*]] ], [ [[LKERN_0]], [[FOR_COND22_PREHEADER]] ] +; CHECK-NEXT: [[N21_0129:%.*]] = phi i32 [ [[INC42:%.*]], [[FOR_END40:%.*]] ], [ [[LKERN_0]], [[FOR_BODY25_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[CORRVOUT]], i32 [[N21_0129]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX28]], align 4 ; CHECK-NEXT: [[SUB29:%.*]] = sub nuw nsw i32 [[N21_0129]], [[LKERN_0]] ; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[SUB29]], 1 -; CHECK-NEXT: [[CMP32_NOT126:%.*]] = icmp ugt i32 [[ADD30]], [[N21_0129]] -; CHECK-NEXT: br i1 [[CMP32_NOT126]], label [[FOR_END40]], label [[FOR_BODY33:%.*]] -; CHECK: for.body33: -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP8:%.*]], [[FOR_BODY33]] ], [ 0.000000e+00, [[FOR_BODY25]] ] -; CHECK-NEXT: [[K27_0127:%.*]] = phi i32 [ [[INC39:%.*]], [[FOR_BODY33]] ], [ [[ADD30]], [[FOR_BODY25]] ] -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K27_0127]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ADD60:%.*]] = add i32 [[ADD30]], [[DIV536]] +; CHECK-NEXT: [[CMP32_NOT126:%.*]] = icmp ult i32 [[ADD30]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP32_NOT126]], label [[FOR_BODY33_PREHEADER:%.*]], label [[FOR_END164:%.*]] +; CHECK: for.body33.preheader: +; CHECK-NEXT: br label [[FOR_BODY33_15:%.*]] +; CHECK: for.body33.15: +; CHECK-NEXT: [[K27_0127:%.*]] = phi i32 [ [[ADD30]], [[FOR_BODY33_PREHEADER]] ], [ [[INC39_15:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ 
[[TMP64:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI10:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP65:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI11:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP66:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI12:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP67:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI13:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP68:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI14:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP69:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI15:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP70:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI16:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP71:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP72:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI18:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP73:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI19:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP74:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI20:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP75:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI21:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP76:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI22:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP77:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI23:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP78:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[DOTPHI24:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY33_PREHEADER]] ], [ [[TMP79:%.*]], [[FOR_BODY33_15]] ] +; CHECK-NEXT: [[INC39:%.*]] = add i32 [[K27_0127]], 1 +; CHECK-NEXT: [[INC39_1:%.*]] = add i32 [[K27_0127]], 2 +; CHECK-NEXT: [[INC39_2:%.*]] = add i32 [[K27_0127]], 3 +; CHECK-NEXT: [[INC39_3:%.*]] = add i32 [[K27_0127]], 4 +; CHECK-NEXT: [[INC39_4:%.*]] = add i32 [[K27_0127]], 5 +; CHECK-NEXT: [[INC39_5:%.*]] = add i32 [[K27_0127]], 6 +; CHECK-NEXT: [[INC39_6:%.*]] = add i32 [[K27_0127]], 7 +; CHECK-NEXT: [[INC39_7:%.*]] = add i32 [[K27_0127]], 8 +; CHECK-NEXT: [[INC39_8:%.*]] = add i32 [[K27_0127]], 9 +; CHECK-NEXT: [[INC39_9:%.*]] = add i32 [[K27_0127]], 10 +; CHECK-NEXT: [[INC39_10:%.*]] = add i32 [[K27_0127]], 11 +; CHECK-NEXT: [[INC39_11:%.*]] = add i32 [[K27_0127]], 12 +; CHECK-NEXT: [[INC39_12:%.*]] = add i32 [[K27_0127]], 13 +; CHECK-NEXT: [[INC39_13:%.*]] = add i32 [[K27_0127]], 14 +; CHECK-NEXT: [[INC39_14:%.*]] = add i32 [[K27_0127]], 15 +; CHECK-NEXT: [[INC39_15]] = add i32 [[K27_0127]], 16 ; CHECK-NEXT: [[SUB35:%.*]] = sub i32 [[K27_0127]], [[ADD30]] +; CHECK-NEXT: [[SUB35_1:%.*]] = sub i32 [[INC39]], [[ADD30]] +; CHECK-NEXT: [[SUB35_2:%.*]] = sub i32 [[INC39_1]], [[ADD30]] +; CHECK-NEXT: [[SUB35_3:%.*]] = sub i32 [[INC39_2]], [[ADD30]] +; CHECK-NEXT: [[SUB35_4:%.*]] = sub i32 [[INC39_3]], [[ADD30]] +; CHECK-NEXT: [[SUB35_5:%.*]] = sub i32 [[INC39_4]], [[ADD30]] +; CHECK-NEXT: [[SUB35_6:%.*]] = sub i32 [[INC39_5]], [[ADD30]] +; CHECK-NEXT: [[SUB35_7:%.*]] = sub i32 [[INC39_6]], [[ADD30]] +; CHECK-NEXT: [[SUB35_8:%.*]] = sub i32 [[INC39_7]], [[ADD30]] +; CHECK-NEXT: [[SUB35_9:%.*]] = sub i32 [[INC39_8]], [[ADD30]] +; CHECK-NEXT: [[SUB35_10:%.*]] = sub i32 [[INC39_9]], [[ADD30]] +; CHECK-NEXT: 
[[SUB35_11:%.*]] = sub i32 [[INC39_10]], [[ADD30]] +; CHECK-NEXT: [[SUB35_12:%.*]] = sub i32 [[INC39_11]], [[ADD30]] +; CHECK-NEXT: [[SUB35_13:%.*]] = sub i32 [[INC39_12]], [[ADD30]] +; CHECK-NEXT: [[SUB35_14:%.*]] = sub i32 [[INC39_13]], [[ADD30]] +; CHECK-NEXT: [[SUB35_15:%.*]] = sub i32 [[INC39_14]], [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K27_0127]] ; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 -; CHECK-NEXT: [[TMP8]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[TMP5]]) -; CHECK-NEXT: store float [[TMP8]], ptr [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[INC39]] = add i32 [[K27_0127]], 1 -; CHECK-NEXT: [[CMP32_NOT:%.*]] = icmp ugt i32 [[INC39]], [[N21_0129]] -; CHECK-NEXT: br i1 [[CMP32_NOT]], label [[FOR_END40]], label [[FOR_BODY33]] +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39]] +; CHECK-NEXT: [[ARRAYIDX36_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_1]] +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_1]] +; CHECK-NEXT: [[ARRAYIDX36_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_2]] +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_2]] +; CHECK-NEXT: [[ARRAYIDX36_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_3]] +; CHECK-NEXT: [[ARRAYIDX34_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_3]] +; CHECK-NEXT: [[ARRAYIDX36_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_4]] +; CHECK-NEXT: [[ARRAYIDX34_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_4]] +; CHECK-NEXT: [[ARRAYIDX36_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_5]] +; CHECK-NEXT: [[ARRAYIDX34_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_5]] +; CHECK-NEXT: [[ARRAYIDX36_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_6]] +; CHECK-NEXT: [[ARRAYIDX34_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_6]] +; CHECK-NEXT: [[ARRAYIDX36_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_7]] +; CHECK-NEXT: [[ARRAYIDX34_8:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_7]] +; CHECK-NEXT: [[ARRAYIDX36_8:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_8]] +; CHECK-NEXT: [[ARRAYIDX34_9:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_8]] +; CHECK-NEXT: [[ARRAYIDX36_9:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_9]] +; CHECK-NEXT: [[ARRAYIDX34_10:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_9]] +; CHECK-NEXT: [[ARRAYIDX36_10:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_10]] +; CHECK-NEXT: [[ARRAYIDX34_11:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_10]] +; CHECK-NEXT: [[ARRAYIDX36_11:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_11]] +; CHECK-NEXT: [[ARRAYIDX34_12:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_11]] +; CHECK-NEXT: [[ARRAYIDX36_12:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_12]] +; CHECK-NEXT: [[ARRAYIDX34_13:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_12]] +; CHECK-NEXT: [[ARRAYIDX36_13:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_13]] +; 
CHECK-NEXT: [[ARRAYIDX34_14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_13]] +; CHECK-NEXT: [[ARRAYIDX36_14:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_14]] +; CHECK-NEXT: [[ARRAYIDX34_15:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC39_14]] +; CHECK-NEXT: [[ARRAYIDX36_15:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_15]] +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX34_1]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX36_1]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX34_2]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX36_2]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX34_3]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[ARRAYIDX36_3]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[ARRAYIDX34_4]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[ARRAYIDX36_4]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX34_5]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX36_5]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX34_6]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX36_6]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX34_7]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[ARRAYIDX36_7]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[ARRAYIDX34_8]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX36_8]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX34_9]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX36_9]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX34_10]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[ARRAYIDX36_10]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX34_11]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX36_11]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX34_12]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[ARRAYIDX36_12]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[ARRAYIDX34_13]], align 4 +; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[ARRAYIDX36_13]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[ARRAYIDX34_14]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[ARRAYIDX36_14]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[ARRAYIDX34_15]], align 4 +; CHECK-NEXT: [[TMP63:%.*]] = load float, ptr [[ARRAYIDX36_15]], align 4 +; CHECK-NEXT: [[TMP64]] = tail call float @llvm.fmuladd.f32(float [[TMP32]], float [[TMP33]], float [[DOTPHI9]]) +; CHECK-NEXT: [[TMP65]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[DOTPHI10]]) +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP36]], float [[TMP37]], float [[DOTPHI11]]) +; CHECK-NEXT: [[TMP67]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[DOTPHI12]]) +; CHECK-NEXT: [[TMP68]] = tail call float @llvm.fmuladd.f32(float [[TMP40]], float [[TMP41]], float [[DOTPHI13]]) +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP42]], float [[TMP43]], float [[DOTPHI14]]) +; CHECK-NEXT: [[TMP70]] = tail call float @llvm.fmuladd.f32(float [[TMP44]], float [[TMP45]], float [[DOTPHI15]]) +; 
CHECK-NEXT: [[TMP71]] = tail call float @llvm.fmuladd.f32(float [[TMP46]], float [[TMP47]], float [[DOTPHI16]]) +; CHECK-NEXT: [[TMP72]] = tail call float @llvm.fmuladd.f32(float [[TMP48]], float [[TMP49]], float [[DOTPHI17]]) +; CHECK-NEXT: [[TMP73]] = tail call float @llvm.fmuladd.f32(float [[TMP50]], float [[TMP51]], float [[DOTPHI18]]) +; CHECK-NEXT: [[TMP74]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP53]], float [[DOTPHI19]]) +; CHECK-NEXT: [[TMP75]] = tail call float @llvm.fmuladd.f32(float [[TMP54]], float [[TMP55]], float [[DOTPHI20]]) +; CHECK-NEXT: [[TMP76]] = tail call float @llvm.fmuladd.f32(float [[TMP56]], float [[TMP57]], float [[DOTPHI21]]) +; CHECK-NEXT: [[TMP77]] = tail call float @llvm.fmuladd.f32(float [[TMP58]], float [[TMP59]], float [[DOTPHI22]]) +; CHECK-NEXT: [[TMP78]] = tail call float @llvm.fmuladd.f32(float [[TMP60]], float [[TMP61]], float [[DOTPHI23]]) +; CHECK-NEXT: [[TMP79]] = tail call float @llvm.fmuladd.f32(float [[TMP62]], float [[TMP63]], float [[DOTPHI24]]) +; CHECK-NEXT: [[CMP32_NOT_15:%.*]] = icmp ult i32 [[INC39_15]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP32_NOT_15]], label [[FOR_BODY33_15]], label [[FOR_END40_LOOPEXIT:%.*]] +; CHECK: for.end40.loopexit: +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[TMP64]], [[TMP65]] +; CHECK-NEXT: [[SUM46:%.*]] = fadd float [[TMP66]], [[TMP67]] +; CHECK-NEXT: [[SUM47:%.*]] = fadd float [[TMP68]], [[TMP69]] +; CHECK-NEXT: [[SUM48:%.*]] = fadd float [[TMP70]], [[TMP71]] +; CHECK-NEXT: [[SUM49:%.*]] = fadd float [[TMP72]], [[TMP73]] +; CHECK-NEXT: [[SUM50:%.*]] = fadd float [[TMP74]], [[TMP75]] +; CHECK-NEXT: [[SUM51:%.*]] = fadd float [[TMP76]], [[TMP77]] +; CHECK-NEXT: [[SUM52:%.*]] = fadd float [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[SUM53:%.*]] = fadd float [[SUM45]], [[SUM46]] +; CHECK-NEXT: [[SUM54:%.*]] = fadd float [[SUM47]], [[SUM48]] +; CHECK-NEXT: [[SUM55:%.*]] = fadd float [[SUM49]], [[SUM50]] +; CHECK-NEXT: [[SUM56:%.*]] = fadd float [[SUM51]], [[SUM52]] +; CHECK-NEXT: [[SUM57:%.*]] = fadd float [[SUM53]], [[SUM54]] +; CHECK-NEXT: [[SUM58:%.*]] = fadd float [[SUM55]], [[SUM56]] +; CHECK-NEXT: [[SUM59:%.*]] = fadd float [[SUM57]], [[SUM58]] +; CHECK-NEXT: br label [[FOR_END164]] +; CHECK: for.end164: +; CHECK-NEXT: [[PHI_SUM:%.*]] = phi i32 [ [[ADD30]], [[FOR_BODY25]] ], [ [[INC39_15]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY25]] ], [ [[SUM59]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT]], ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[CMP182_NOT587:%.*]] = icmp ugt i32 [[PHI_SUM]], [[N21_0129]] +; CHECK-NEXT: br i1 [[CMP182_NOT587]], label [[FOR_END40]], label [[FOR_BODY33_CLONE:%.*]] +; CHECK: for.body33.clone: +; CHECK-NEXT: [[TMP80:%.*]] = phi float [ [[TMP83:%.*]], [[FOR_BODY33_CLONE]] ], [ [[PHI_FLOAT]], [[FOR_END164]] ] +; CHECK-NEXT: [[K27_0127_CLONE:%.*]] = phi i32 [ [[INC39_CLONE:%.*]], [[FOR_BODY33_CLONE]] ], [ [[PHI_SUM]], [[FOR_END164]] ] +; CHECK-NEXT: [[ARRAYIDX34_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K27_0127_CLONE]] +; CHECK-NEXT: [[TMP81:%.*]] = load float, ptr [[ARRAYIDX34_CLONE]], align 4 +; CHECK-NEXT: [[SUB35_CLONE:%.*]] = sub i32 [[K27_0127_CLONE]], [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX36_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB35_CLONE]] +; CHECK-NEXT: [[TMP82:%.*]] = load float, ptr [[ARRAYIDX36_CLONE]], align 4 +; CHECK-NEXT: [[TMP83]] = tail call float @llvm.fmuladd.f32(float [[TMP81]], float [[TMP82]], float [[TMP80]]) +; CHECK-NEXT: 
[[INC39_CLONE]] = add i32 [[K27_0127_CLONE]], 1 +; CHECK-NEXT: [[CMP32_NOT_CLONE:%.*]] = icmp ugt i32 [[INC39_CLONE]], [[N21_0129]] +; CHECK-NEXT: br i1 [[CMP32_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE25:%.*]], label [[FOR_BODY33_CLONE]] +; CHECK: for.cond.for.end_crit_edge25: +; CHECK-NEXT: store float [[TMP83]], ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: br label [[FOR_END40]] ; CHECK: for.end40: ; CHECK-NEXT: [[INC42]] = add nuw nsw i32 [[N21_0129]], 1 ; CHECK-NEXT: [[EXITCOND135_NOT:%.*]] = icmp eq i32 [[INC42]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[EXITCOND135_NOT]], label [[FOR_COND45_PREHEADER]], label [[FOR_BODY25]] +; CHECK-NEXT: br i1 [[EXITCOND135_NOT]], label [[FOR_COND45_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY25]] ; CHECK: for.body50: ; CHECK-NEXT: [[N44_0133:%.*]] = phi i32 [ [[LSIG_0]], [[FOR_BODY50_LR_PH]] ], [ [[INC69:%.*]], [[FOR_END67:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds float, ptr [[CORRVOUT]], i32 [[N44_0133]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX54]], align 4 ; CHECK-NEXT: [[SUB55:%.*]] = sub nsw i32 [[N44_0133]], [[LKERN_0]] ; CHECK-NEXT: [[ADD56:%.*]] = add nsw i32 [[SUB55]], 1 -; CHECK-NEXT: [[CMP59_NOT130:%.*]] = icmp ugt i32 [[ADD56]], [[SUB57]] -; CHECK-NEXT: br i1 [[CMP59_NOT130]], label [[FOR_END67]], label [[FOR_BODY60:%.*]] -; CHECK: for.body60: -; CHECK-NEXT: [[TMP9:%.*]] = phi float [ [[TMP12:%.*]], [[FOR_BODY60]] ], [ 0.000000e+00, [[FOR_BODY50]] ] -; CHECK-NEXT: [[K53_0131:%.*]] = phi i32 [ [[INC66:%.*]], [[FOR_BODY60]] ], [ [[ADD56]], [[FOR_BODY50]] ] -; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K53_0131]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[ADD207_NEG:%.*]] = xor i32 [[SUB55]], -1 +; CHECK-NEXT: [[ADD211:%.*]] = add i32 [[ADD207_NEG]], [[LSIG_0]] +; CHECK-NEXT: [[DIV212535:%.*]] = and i32 [[ADD211]], -8 +; CHECK-NEXT: [[ADD214:%.*]] = add i32 [[DIV212535]], [[ADD56]] +; CHECK-NEXT: [[CMP59_NOT130:%.*]] = icmp ult i32 [[ADD56]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP59_NOT130]], label [[FOR_BODY60_PREHEADER:%.*]], label [[FOR_END16434:%.*]] +; CHECK: for.body60.preheader: +; CHECK-NEXT: br label [[FOR_BODY60_7:%.*]] +; CHECK: for.body60.7: +; CHECK-NEXT: [[K53_0131:%.*]] = phi i32 [ [[ADD56]], [[FOR_BODY60_PREHEADER]] ], [ [[INC66_7:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI26:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP100:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI27:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP101:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI28:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP102:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI29:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP103:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI30:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP104:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI31:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP105:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI32:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP106:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[DOTPHI33:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY60_PREHEADER]] ], [ [[TMP107:%.*]], [[FOR_BODY60_7]] ] +; CHECK-NEXT: [[INC66:%.*]] = add i32 [[K53_0131]], 1 +; CHECK-NEXT: [[INC66_1:%.*]] = add i32 [[K53_0131]], 2 +; CHECK-NEXT: [[INC66_2:%.*]] = add i32 [[K53_0131]], 3 +; 
CHECK-NEXT: [[INC66_3:%.*]] = add i32 [[K53_0131]], 4 +; CHECK-NEXT: [[INC66_4:%.*]] = add i32 [[K53_0131]], 5 +; CHECK-NEXT: [[INC66_5:%.*]] = add i32 [[K53_0131]], 6 +; CHECK-NEXT: [[INC66_6:%.*]] = add i32 [[K53_0131]], 7 +; CHECK-NEXT: [[INC66_7]] = add i32 [[K53_0131]], 8 ; CHECK-NEXT: [[SUB62:%.*]] = sub i32 [[K53_0131]], [[ADD56]] +; CHECK-NEXT: [[SUB62_1:%.*]] = sub i32 [[INC66]], [[ADD56]] +; CHECK-NEXT: [[SUB62_2:%.*]] = sub i32 [[INC66_1]], [[ADD56]] +; CHECK-NEXT: [[SUB62_3:%.*]] = sub i32 [[INC66_2]], [[ADD56]] +; CHECK-NEXT: [[SUB62_4:%.*]] = sub i32 [[INC66_3]], [[ADD56]] +; CHECK-NEXT: [[SUB62_5:%.*]] = sub i32 [[INC66_4]], [[ADD56]] +; CHECK-NEXT: [[SUB62_6:%.*]] = sub i32 [[INC66_5]], [[ADD56]] +; CHECK-NEXT: [[SUB62_7:%.*]] = sub i32 [[INC66_6]], [[ADD56]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K53_0131]] ; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX63]], align 4 -; CHECK-NEXT: [[TMP12]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[TMP9]]) -; CHECK-NEXT: store float [[TMP12]], ptr [[ARRAYIDX54]], align 4 -; CHECK-NEXT: [[INC66]] = add i32 [[K53_0131]], 1 -; CHECK-NEXT: [[CMP59_NOT:%.*]] = icmp ugt i32 [[INC66]], [[SUB57]] -; CHECK-NEXT: br i1 [[CMP59_NOT]], label [[FOR_END67]], label [[FOR_BODY60]] +; CHECK-NEXT: [[ARRAYIDX61_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66]] +; CHECK-NEXT: [[ARRAYIDX63_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_1]] +; CHECK-NEXT: [[ARRAYIDX61_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_1]] +; CHECK-NEXT: [[ARRAYIDX63_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_2]] +; CHECK-NEXT: [[ARRAYIDX61_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_2]] +; CHECK-NEXT: [[ARRAYIDX63_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_3]] +; CHECK-NEXT: [[ARRAYIDX61_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_3]] +; CHECK-NEXT: [[ARRAYIDX63_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_4]] +; CHECK-NEXT: [[ARRAYIDX61_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_4]] +; CHECK-NEXT: [[ARRAYIDX63_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_5]] +; CHECK-NEXT: [[ARRAYIDX61_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_5]] +; CHECK-NEXT: [[ARRAYIDX63_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_6]] +; CHECK-NEXT: [[ARRAYIDX61_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC66_6]] +; CHECK-NEXT: [[ARRAYIDX63_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_7]] +; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX63]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = load float, ptr [[ARRAYIDX61_1]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load float, ptr [[ARRAYIDX63_1]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = load float, ptr [[ARRAYIDX61_2]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load float, ptr [[ARRAYIDX63_2]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load float, ptr [[ARRAYIDX61_3]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[ARRAYIDX63_3]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[ARRAYIDX61_4]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[ARRAYIDX63_4]], align 4 +; 
CHECK-NEXT: [[TMP94:%.*]] = load float, ptr [[ARRAYIDX61_5]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = load float, ptr [[ARRAYIDX63_5]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = load float, ptr [[ARRAYIDX61_6]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load float, ptr [[ARRAYIDX63_6]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load float, ptr [[ARRAYIDX61_7]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load float, ptr [[ARRAYIDX63_7]], align 4 +; CHECK-NEXT: [[TMP100]] = tail call float @llvm.fmuladd.f32(float [[TMP84]], float [[TMP85]], float [[DOTPHI26]]) +; CHECK-NEXT: [[TMP101]] = tail call float @llvm.fmuladd.f32(float [[TMP86]], float [[TMP87]], float [[DOTPHI27]]) +; CHECK-NEXT: [[TMP102]] = tail call float @llvm.fmuladd.f32(float [[TMP88]], float [[TMP89]], float [[DOTPHI28]]) +; CHECK-NEXT: [[TMP103]] = tail call float @llvm.fmuladd.f32(float [[TMP90]], float [[TMP91]], float [[DOTPHI29]]) +; CHECK-NEXT: [[TMP104]] = tail call float @llvm.fmuladd.f32(float [[TMP92]], float [[TMP93]], float [[DOTPHI30]]) +; CHECK-NEXT: [[TMP105]] = tail call float @llvm.fmuladd.f32(float [[TMP94]], float [[TMP95]], float [[DOTPHI31]]) +; CHECK-NEXT: [[TMP106]] = tail call float @llvm.fmuladd.f32(float [[TMP96]], float [[TMP97]], float [[DOTPHI32]]) +; CHECK-NEXT: [[TMP107]] = tail call float @llvm.fmuladd.f32(float [[TMP98]], float [[TMP99]], float [[DOTPHI33]]) +; CHECK-NEXT: [[CMP59_NOT_7:%.*]] = icmp ult i32 [[INC66_7]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP59_NOT_7]], label [[FOR_BODY60_7]], label [[FOR_END67_LOOPEXIT:%.*]] +; CHECK: for.end67.loopexit: +; CHECK-NEXT: [[SUM60:%.*]] = fadd float [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[SUM61:%.*]] = fadd float [[TMP102]], [[TMP103]] +; CHECK-NEXT: [[SUM62:%.*]] = fadd float [[TMP104]], [[TMP105]] +; CHECK-NEXT: [[SUM63:%.*]] = fadd float [[TMP106]], [[TMP107]] +; CHECK-NEXT: [[SUM64:%.*]] = fadd float [[SUM60]], [[SUM61]] +; CHECK-NEXT: [[SUM65:%.*]] = fadd float [[SUM62]], [[SUM63]] +; CHECK-NEXT: [[SUM66:%.*]] = fadd float [[SUM64]], [[SUM65]] +; CHECK-NEXT: br label [[FOR_END16434]] +; CHECK: for.end16434: +; CHECK-NEXT: [[PHI_SUM35:%.*]] = phi i32 [ [[ADD56]], [[FOR_BODY50]] ], [ [[INC66_7]], [[FOR_END67_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT36:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY50]] ], [ [[SUM66]], [[FOR_END67_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT36]], ptr [[ARRAYIDX54]], align 4 +; CHECK-NEXT: [[CMP182_NOT58737:%.*]] = icmp ugt i32 [[PHI_SUM35]], [[SUB57]] +; CHECK-NEXT: br i1 [[CMP182_NOT58737]], label [[FOR_END67]], label [[FOR_BODY60_CLONE:%.*]] +; CHECK: for.body60.clone: +; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP111:%.*]], [[FOR_BODY60_CLONE]] ], [ [[PHI_FLOAT36]], [[FOR_END16434]] ] +; CHECK-NEXT: [[K53_0131_CLONE:%.*]] = phi i32 [ [[INC66_CLONE:%.*]], [[FOR_BODY60_CLONE]] ], [ [[PHI_SUM35]], [[FOR_END16434]] ] +; CHECK-NEXT: [[ARRAYIDX61_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K53_0131_CLONE]] +; CHECK-NEXT: [[TMP109:%.*]] = load float, ptr [[ARRAYIDX61_CLONE]], align 4 +; CHECK-NEXT: [[SUB62_CLONE:%.*]] = sub i32 [[K53_0131_CLONE]], [[ADD56]] +; CHECK-NEXT: [[ARRAYIDX63_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB62_CLONE]] +; CHECK-NEXT: [[TMP110:%.*]] = load float, ptr [[ARRAYIDX63_CLONE]], align 4 +; CHECK-NEXT: [[TMP111]] = tail call float @llvm.fmuladd.f32(float [[TMP109]], float [[TMP110]], float [[TMP108]]) +; CHECK-NEXT: [[INC66_CLONE]] = add i32 [[K53_0131_CLONE]], 1 +; CHECK-NEXT: [[CMP59_NOT_CLONE:%.*]] = icmp ugt i32 [[INC66_CLONE]], [[SUB57]] +; 
CHECK-NEXT: br i1 [[CMP59_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE38:%.*]], label [[FOR_BODY60_CLONE]] +; CHECK: for.cond.for.end_crit_edge38: +; CHECK-NEXT: store float [[TMP111]], ptr [[ARRAYIDX54]], align 4 +; CHECK-NEXT: br label [[FOR_END67]] ; CHECK: for.end67: ; CHECK-NEXT: [[INC69]] = add nsw i32 [[N44_0133]], 1 ; CHECK-NEXT: [[EXITCOND136_NOT:%.*]] = icmp eq i32 [[INC69]], [[SUB47]] -; CHECK-NEXT: br i1 [[EXITCOND136_NOT]], label [[RETURN]], label [[FOR_BODY50]] +; CHECK-NEXT: br i1 [[EXITCOND136_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_BODY50]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND45_PREHEADER]] ], [ 0, [[FOR_END67]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND45_PREHEADER]] ], [ 0, [[RETURN_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll index 33a08dfbf9df1c..86f9a334884556 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/conv.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_conv_f32_ansi(ptr noundef readonly %Signal, i32 noundef %siglen, ptr noundef readonly %Kernel, i32 noundef %kernlen, ptr noundef writeonly %convout) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_conv_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noundef writeonly [[CONVOUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noalias noundef readonly [[KERNEL:%.*]], i32 noundef [[KERNLEN:%.*]], ptr noalias noundef writeonly [[CONVOUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[SIGNAL]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[KERNEL]], null @@ -21,34 +21,129 @@ define dso_local noundef i32 @dsps_conv_f32_ansi(ptr noundef readonly %Signal, i ; CHECK-NEXT: [[KERN_0:%.*]] = phi ptr [ [[SIGNAL]], [[IF_THEN8]] ], [ [[KERNEL]], [[IF_END6]] ] ; CHECK-NEXT: [[SIG_0:%.*]] = phi ptr [ [[KERNEL]], [[IF_THEN8]] ], [ [[SIGNAL]], [[IF_END6]] ] ; CHECK-NEXT: [[CMP10120:%.*]] = icmp sgt i32 [[LKERN_0]], 0 -; CHECK-NEXT: br i1 [[CMP10120]], label [[FOR_BODY:%.*]], label [[FOR_COND21_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP10120]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND21_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond21.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND21_PREHEADER]] ; CHECK: for.cond21.preheader: ; CHECK-NEXT: [[CMP22125:%.*]] = icmp slt i32 [[LKERN_0]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[CMP22125]], label [[FOR_BODY24:%.*]], label [[FOR_COND42_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP22125]], label [[FOR_BODY24_PREHEADER:%.*]], label [[FOR_COND42_PREHEADER:%.*]] +; CHECK: 
for.body24.preheader: +; CHECK-NEXT: [[DIV536:%.*]] = and i32 [[LKERN_0]], -16 +; CHECK-NEXT: br label [[FOR_BODY24:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[IF_END9]] ] -; CHECK-NEXT: [[N_0121:%.*]] = phi i32 [ [[INC18:%.*]], [[FOR_END]] ], [ 0, [[IF_END9]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_0121:%.*]] = phi i32 [ [[INC18:%.*]], [[FOR_END]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[N_0121]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[N_0121]], 2147483640 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[CONVOUT]], i32 [[N_0121]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[FOR_BODY13:%.*]] -; CHECK: for.body13: -; CHECK-NEXT: [[K_0119:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY13]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP3:%.*]], [[FOR_BODY13]] ] -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0119]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY13_CLONE_PREHEADER:%.*]], label [[FOR_BODY13_7:%.*]] +; CHECK: for.body13.7: +; CHECK-NEXT: [[K_0119:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC_7:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP19:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI1:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP20:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI2:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI3:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP22:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP23:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI5:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[DOTPHI7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP26:%.*]], [[FOR_BODY13_7]] ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[K_0119]], 1 +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[K_0119]], 2 +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[K_0119]], 3 +; CHECK-NEXT: [[INC_3:%.*]] = add nuw nsw i32 [[K_0119]], 4 +; CHECK-NEXT: [[INC_4:%.*]] = add nuw nsw i32 [[K_0119]], 5 +; CHECK-NEXT: [[INC_5:%.*]] = add nuw nsw i32 [[K_0119]], 6 +; CHECK-NEXT: [[INC_6:%.*]] = add nuw nsw i32 [[K_0119]], 7 +; CHECK-NEXT: [[INC_7]] = add nuw nsw i32 [[K_0119]], 8 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N_0121]], [[K_0119]] +; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[N_0121]], [[INC]] +; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[N_0121]], [[INC_1]] +; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[N_0121]], [[INC_2]] +; CHECK-NEXT: [[SUB_4:%.*]] = sub nsw i32 [[N_0121]], [[INC_3]] +; CHECK-NEXT: [[SUB_5:%.*]] = sub nsw i32 [[N_0121]], [[INC_4]] +; CHECK-NEXT: [[SUB_6:%.*]] = sub nsw i32 [[N_0121]], [[INC_5]] +; CHECK-NEXT: [[SUB_7:%.*]] = sub nsw i32 [[N_0121]], [[INC_6]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0119]] ; CHECK-NEXT: 
[[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP3]] = tail call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP2]], float [[TMP0]]) -; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K_0119]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY13]] +; CHECK-NEXT: [[ARRAYIDX14_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC]] +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_1]] +; CHECK-NEXT: [[ARRAYIDX14_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_1]] +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_2]] +; CHECK-NEXT: [[ARRAYIDX14_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_2]] +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_3]] +; CHECK-NEXT: [[ARRAYIDX14_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_3]] +; CHECK-NEXT: [[ARRAYIDX15_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_4]] +; CHECK-NEXT: [[ARRAYIDX14_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_4]] +; CHECK-NEXT: [[ARRAYIDX15_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_5]] +; CHECK-NEXT: [[ARRAYIDX14_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_5]] +; CHECK-NEXT: [[ARRAYIDX15_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_6]] +; CHECK-NEXT: [[ARRAYIDX14_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[INC_6]] +; CHECK-NEXT: [[ARRAYIDX15_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_7]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX14_1]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX15_1]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14_2]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX15_2]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14_3]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15_3]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX14_4]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX15_4]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX14_5]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX15_5]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX14_6]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX15_6]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX14_7]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX15_7]], align 4 +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP3]], float [[TMP4]], float [[DOTPHI]]) +; CHECK-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[DOTPHI1]]) +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP7]], float [[TMP8]], float [[DOTPHI2]]) +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[DOTPHI3]]) +; CHECK-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float 
[[TMP11]], float [[TMP12]], float [[DOTPHI4]]) +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP13]], float [[TMP14]], float [[DOTPHI5]]) +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[DOTPHI6]]) +; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP17]], float [[TMP18]], float [[DOTPHI7]]) +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp ult i32 [[INC_7]], [[TMP1]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[FOR_BODY13_7]], label [[FOR_END8:%.*]] +; CHECK: for.end8: +; CHECK-NEXT: [[SUM:%.*]] = fadd float [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[SUM24:%.*]] = fadd float [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[SUM25:%.*]] = fadd float [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[SUM26:%.*]] = fadd float [[SUM]], [[SUM23]] +; CHECK-NEXT: [[SUM27:%.*]] = fadd float [[SUM24]], [[SUM25]] +; CHECK-NEXT: [[SUM28:%.*]] = fadd float [[SUM26]], [[SUM27]] +; CHECK-NEXT: store float [[SUM28]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br i1 false, label [[FOR_END]], label [[FOR_BODY13_CLONE_PREHEADER]] +; CHECK: for.body13.clone.preheader: +; CHECK-NEXT: [[SUM_PHI:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[SUM28]], [[FOR_END8]] ] +; CHECK-NEXT: [[ADD_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP0]], [[FOR_END8]] ] +; CHECK-NEXT: br label [[FOR_BODY13_CLONE:%.*]] +; CHECK: for.body13.clone: +; CHECK-NEXT: [[K_0119_CLONE:%.*]] = phi i32 [ [[ADD_PHI]], [[FOR_BODY13_CLONE_PREHEADER]] ], [ [[INC_CLONE:%.*]], [[FOR_BODY13_CLONE]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi float [ [[SUM_PHI]], [[FOR_BODY13_CLONE_PREHEADER]] ], [ [[TMP30:%.*]], [[FOR_BODY13_CLONE]] ] +; CHECK-NEXT: [[ARRAYIDX14_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K_0119_CLONE]] +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX14_CLONE]], align 4 +; CHECK-NEXT: [[SUB_CLONE:%.*]] = sub nsw i32 [[N_0121]], [[K_0119_CLONE]] +; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB_CLONE]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 +; CHECK-NEXT: [[TMP30]] = tail call float @llvm.fmuladd.f32(float [[TMP28]], float [[TMP29]], float [[TMP27]]) +; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[K_0119_CLONE]], 1 +; CHECK-NEXT: [[EXITCOND_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[INDVARS_IV]] +; CHECK-NEXT: br i1 [[EXITCOND_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY13_CLONE]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: store float [[TMP30]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[INC18]] = add nuw nsw i32 [[N_0121]], 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND132_NOT:%.*]] = icmp eq i32 [[INC18]], [[LKERN_0]] -; CHECK-NEXT: br i1 [[EXITCOND132_NOT]], label [[FOR_COND21_PREHEADER]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND132_NOT]], label [[FOR_COND21_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.cond42.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND42_PREHEADER]] ; CHECK: for.cond42.preheader: ; CHECK-NEXT: [[ADD43:%.*]] = add i32 [[SIGLEN]], -1 ; CHECK-NEXT: [[SUB44:%.*]] = add i32 [[ADD43]], [[KERNLEN]] @@ -58,57 +153,308 @@ define dso_local noundef i32 @dsps_conv_f32_ansi(ptr noundef readonly %Signal, i ; CHECK-NEXT: [[SUB54:%.*]] = add nsw i32 [[LSIG_0]], -1 ; CHECK-NEXT: br label [[FOR_BODY47:%.*]] ; CHECK: for.body24: -; 
CHECK-NEXT: [[N20_0126:%.*]] = phi i32 [ [[INC39:%.*]], [[FOR_END37:%.*]] ], [ [[LKERN_0]], [[FOR_COND21_PREHEADER]] ] +; CHECK-NEXT: [[N20_0126:%.*]] = phi i32 [ [[INC39:%.*]], [[FOR_END37:%.*]] ], [ [[LKERN_0]], [[FOR_BODY24_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[CONVOUT]], i32 [[N20_0126]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX26]], align 4 ; CHECK-NEXT: [[SUB27:%.*]] = sub nuw nsw i32 [[N20_0126]], [[LKERN_0]] ; CHECK-NEXT: [[K25_0122:%.*]] = add i32 [[SUB27]], 1 -; CHECK-NEXT: [[CMP29_NOT123:%.*]] = icmp ugt i32 [[K25_0122]], [[N20_0126]] -; CHECK-NEXT: br i1 [[CMP29_NOT123]], label [[FOR_END37]], label [[FOR_BODY30:%.*]] -; CHECK: for.body30: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP7:%.*]], [[FOR_BODY30]] ], [ 0.000000e+00, [[FOR_BODY24]] ] -; CHECK-NEXT: [[K25_0124:%.*]] = phi i32 [ [[K25_0:%.*]], [[FOR_BODY30]] ], [ [[K25_0122]], [[FOR_BODY24]] ] -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0124]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[ADD60:%.*]] = add i32 [[K25_0122]], [[DIV536]] +; CHECK-NEXT: [[CMP29_NOT123:%.*]] = icmp ult i32 [[K25_0122]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP29_NOT123]], label [[FOR_BODY30_PREHEADER:%.*]], label [[FOR_END164:%.*]] +; CHECK: for.body30.preheader: +; CHECK-NEXT: br label [[FOR_BODY30_15:%.*]] +; CHECK: for.body30.15: +; CHECK-NEXT: [[K25_0124:%.*]] = phi i32 [ [[K25_0122]], [[FOR_BODY30_PREHEADER]] ], [ [[K25_0_15:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP63:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI10:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP64:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI11:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP65:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI12:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP66:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI13:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP67:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI14:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP68:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI15:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP69:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI16:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP70:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP71:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI18:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP72:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI19:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP73:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI20:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP74:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI21:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP75:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI22:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP76:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI23:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP77:%.*]], [[FOR_BODY30_15]] ] +; CHECK-NEXT: [[DOTPHI24:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY30_PREHEADER]] ], [ [[TMP78:%.*]], 
[[FOR_BODY30_15]] ] +; CHECK-NEXT: [[K25_0:%.*]] = add i32 [[K25_0124]], 1 +; CHECK-NEXT: [[K25_0_1:%.*]] = add i32 [[K25_0124]], 2 +; CHECK-NEXT: [[K25_0_2:%.*]] = add i32 [[K25_0124]], 3 +; CHECK-NEXT: [[K25_0_3:%.*]] = add i32 [[K25_0124]], 4 +; CHECK-NEXT: [[K25_0_4:%.*]] = add i32 [[K25_0124]], 5 +; CHECK-NEXT: [[K25_0_5:%.*]] = add i32 [[K25_0124]], 6 +; CHECK-NEXT: [[K25_0_6:%.*]] = add i32 [[K25_0124]], 7 +; CHECK-NEXT: [[K25_0_7:%.*]] = add i32 [[K25_0124]], 8 +; CHECK-NEXT: [[K25_0_8:%.*]] = add i32 [[K25_0124]], 9 +; CHECK-NEXT: [[K25_0_9:%.*]] = add i32 [[K25_0124]], 10 +; CHECK-NEXT: [[K25_0_10:%.*]] = add i32 [[K25_0124]], 11 +; CHECK-NEXT: [[K25_0_11:%.*]] = add i32 [[K25_0124]], 12 +; CHECK-NEXT: [[K25_0_12:%.*]] = add i32 [[K25_0124]], 13 +; CHECK-NEXT: [[K25_0_13:%.*]] = add i32 [[K25_0124]], 14 +; CHECK-NEXT: [[K25_0_14:%.*]] = add i32 [[K25_0124]], 15 +; CHECK-NEXT: [[K25_0_15]] = add i32 [[K25_0124]], 16 ; CHECK-NEXT: [[SUB32:%.*]] = sub i32 [[N20_0126]], [[K25_0124]] +; CHECK-NEXT: [[SUB32_1:%.*]] = sub i32 [[N20_0126]], [[K25_0]] +; CHECK-NEXT: [[SUB32_2:%.*]] = sub i32 [[N20_0126]], [[K25_0_1]] +; CHECK-NEXT: [[SUB32_3:%.*]] = sub i32 [[N20_0126]], [[K25_0_2]] +; CHECK-NEXT: [[SUB32_4:%.*]] = sub i32 [[N20_0126]], [[K25_0_3]] +; CHECK-NEXT: [[SUB32_5:%.*]] = sub i32 [[N20_0126]], [[K25_0_4]] +; CHECK-NEXT: [[SUB32_6:%.*]] = sub i32 [[N20_0126]], [[K25_0_5]] +; CHECK-NEXT: [[SUB32_7:%.*]] = sub i32 [[N20_0126]], [[K25_0_6]] +; CHECK-NEXT: [[SUB32_8:%.*]] = sub i32 [[N20_0126]], [[K25_0_7]] +; CHECK-NEXT: [[SUB32_9:%.*]] = sub i32 [[N20_0126]], [[K25_0_8]] +; CHECK-NEXT: [[SUB32_10:%.*]] = sub i32 [[N20_0126]], [[K25_0_9]] +; CHECK-NEXT: [[SUB32_11:%.*]] = sub i32 [[N20_0126]], [[K25_0_10]] +; CHECK-NEXT: [[SUB32_12:%.*]] = sub i32 [[N20_0126]], [[K25_0_11]] +; CHECK-NEXT: [[SUB32_13:%.*]] = sub i32 [[N20_0126]], [[K25_0_12]] +; CHECK-NEXT: [[SUB32_14:%.*]] = sub i32 [[N20_0126]], [[K25_0_13]] +; CHECK-NEXT: [[SUB32_15:%.*]] = sub i32 [[N20_0126]], [[K25_0_14]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0124]] ; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 -; CHECK-NEXT: [[TMP7]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[TMP4]]) -; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX26]], align 4 -; CHECK-NEXT: [[K25_0]] = add i32 [[K25_0124]], 1 -; CHECK-NEXT: [[CMP29_NOT:%.*]] = icmp ugt i32 [[K25_0]], [[N20_0126]] -; CHECK-NEXT: br i1 [[CMP29_NOT]], label [[FOR_END37]], label [[FOR_BODY30]] +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0]] +; CHECK-NEXT: [[ARRAYIDX33_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_1]] +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_1]] +; CHECK-NEXT: [[ARRAYIDX33_2:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_2]] +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_2]] +; CHECK-NEXT: [[ARRAYIDX33_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_3]] +; CHECK-NEXT: [[ARRAYIDX31_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_3]] +; CHECK-NEXT: [[ARRAYIDX33_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_4]] +; CHECK-NEXT: [[ARRAYIDX31_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_4]] +; 
CHECK-NEXT: [[ARRAYIDX33_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_5]] +; CHECK-NEXT: [[ARRAYIDX31_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_5]] +; CHECK-NEXT: [[ARRAYIDX33_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_6]] +; CHECK-NEXT: [[ARRAYIDX31_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_6]] +; CHECK-NEXT: [[ARRAYIDX33_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_7]] +; CHECK-NEXT: [[ARRAYIDX31_8:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_7]] +; CHECK-NEXT: [[ARRAYIDX33_8:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_8]] +; CHECK-NEXT: [[ARRAYIDX31_9:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_8]] +; CHECK-NEXT: [[ARRAYIDX33_9:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_9]] +; CHECK-NEXT: [[ARRAYIDX31_10:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_9]] +; CHECK-NEXT: [[ARRAYIDX33_10:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_10]] +; CHECK-NEXT: [[ARRAYIDX31_11:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_10]] +; CHECK-NEXT: [[ARRAYIDX33_11:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_11]] +; CHECK-NEXT: [[ARRAYIDX31_12:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_11]] +; CHECK-NEXT: [[ARRAYIDX33_12:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_12]] +; CHECK-NEXT: [[ARRAYIDX31_13:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_12]] +; CHECK-NEXT: [[ARRAYIDX33_13:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_13]] +; CHECK-NEXT: [[ARRAYIDX31_14:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_13]] +; CHECK-NEXT: [[ARRAYIDX33_14:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_14]] +; CHECK-NEXT: [[ARRAYIDX31_15:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0_14]] +; CHECK-NEXT: [[ARRAYIDX33_15:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_15]] +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX31_1]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX33_1]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX31_2]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX33_2]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX31_3]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX33_3]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[ARRAYIDX31_4]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[ARRAYIDX33_4]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[ARRAYIDX31_5]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX33_5]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX31_6]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX33_6]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX31_7]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX33_7]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[ARRAYIDX31_8]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[ARRAYIDX33_8]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX31_9]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr 
[[ARRAYIDX33_9]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX31_10]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX33_10]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[ARRAYIDX31_11]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX33_11]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX31_12]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX33_12]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[ARRAYIDX31_13]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[ARRAYIDX33_13]], align 4 +; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[ARRAYIDX31_14]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[ARRAYIDX33_14]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[ARRAYIDX31_15]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[ARRAYIDX33_15]], align 4 +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP31]], float [[TMP32]], float [[DOTPHI9]]) +; CHECK-NEXT: [[TMP64]] = tail call float @llvm.fmuladd.f32(float [[TMP33]], float [[TMP34]], float [[DOTPHI10]]) +; CHECK-NEXT: [[TMP65]] = tail call float @llvm.fmuladd.f32(float [[TMP35]], float [[TMP36]], float [[DOTPHI11]]) +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[DOTPHI12]]) +; CHECK-NEXT: [[TMP67]] = tail call float @llvm.fmuladd.f32(float [[TMP39]], float [[TMP40]], float [[DOTPHI13]]) +; CHECK-NEXT: [[TMP68]] = tail call float @llvm.fmuladd.f32(float [[TMP41]], float [[TMP42]], float [[DOTPHI14]]) +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP43]], float [[TMP44]], float [[DOTPHI15]]) +; CHECK-NEXT: [[TMP70]] = tail call float @llvm.fmuladd.f32(float [[TMP45]], float [[TMP46]], float [[DOTPHI16]]) +; CHECK-NEXT: [[TMP71]] = tail call float @llvm.fmuladd.f32(float [[TMP47]], float [[TMP48]], float [[DOTPHI17]]) +; CHECK-NEXT: [[TMP72]] = tail call float @llvm.fmuladd.f32(float [[TMP49]], float [[TMP50]], float [[DOTPHI18]]) +; CHECK-NEXT: [[TMP73]] = tail call float @llvm.fmuladd.f32(float [[TMP51]], float [[TMP52]], float [[DOTPHI19]]) +; CHECK-NEXT: [[TMP74]] = tail call float @llvm.fmuladd.f32(float [[TMP53]], float [[TMP54]], float [[DOTPHI20]]) +; CHECK-NEXT: [[TMP75]] = tail call float @llvm.fmuladd.f32(float [[TMP55]], float [[TMP56]], float [[DOTPHI21]]) +; CHECK-NEXT: [[TMP76]] = tail call float @llvm.fmuladd.f32(float [[TMP57]], float [[TMP58]], float [[DOTPHI22]]) +; CHECK-NEXT: [[TMP77]] = tail call float @llvm.fmuladd.f32(float [[TMP59]], float [[TMP60]], float [[DOTPHI23]]) +; CHECK-NEXT: [[TMP78]] = tail call float @llvm.fmuladd.f32(float [[TMP61]], float [[TMP62]], float [[DOTPHI24]]) +; CHECK-NEXT: [[CMP29_NOT_15:%.*]] = icmp ult i32 [[K25_0_15]], [[ADD60]] +; CHECK-NEXT: br i1 [[CMP29_NOT_15]], label [[FOR_BODY30_15]], label [[FOR_END37_LOOPEXIT:%.*]] +; CHECK: for.end37.loopexit: +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[SUM46:%.*]] = fadd float [[TMP65]], [[TMP66]] +; CHECK-NEXT: [[SUM47:%.*]] = fadd float [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[SUM48:%.*]] = fadd float [[TMP69]], [[TMP70]] +; CHECK-NEXT: [[SUM49:%.*]] = fadd float [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[SUM50:%.*]] = fadd float [[TMP73]], [[TMP74]] +; CHECK-NEXT: [[SUM51:%.*]] = fadd float [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[SUM52:%.*]] = fadd float [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[SUM53:%.*]] = fadd float [[SUM45]], [[SUM46]] +; CHECK-NEXT: [[SUM54:%.*]] = 
fadd float [[SUM47]], [[SUM48]] +; CHECK-NEXT: [[SUM55:%.*]] = fadd float [[SUM49]], [[SUM50]] +; CHECK-NEXT: [[SUM56:%.*]] = fadd float [[SUM51]], [[SUM52]] +; CHECK-NEXT: [[SUM57:%.*]] = fadd float [[SUM53]], [[SUM54]] +; CHECK-NEXT: [[SUM58:%.*]] = fadd float [[SUM55]], [[SUM56]] +; CHECK-NEXT: [[SUM59:%.*]] = fadd float [[SUM57]], [[SUM58]] +; CHECK-NEXT: br label [[FOR_END164]] +; CHECK: for.end164: +; CHECK-NEXT: [[PHI_SUM:%.*]] = phi i32 [ [[K25_0122]], [[FOR_BODY24]] ], [ [[K25_0_15]], [[FOR_END37_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY24]] ], [ [[SUM59]], [[FOR_END37_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT]], ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[CMP182_NOT587:%.*]] = icmp ugt i32 [[PHI_SUM]], [[N20_0126]] +; CHECK-NEXT: br i1 [[CMP182_NOT587]], label [[FOR_END37]], label [[FOR_BODY30_CLONE:%.*]] +; CHECK: for.body30.clone: +; CHECK-NEXT: [[TMP79:%.*]] = phi float [ [[TMP82:%.*]], [[FOR_BODY30_CLONE]] ], [ [[PHI_FLOAT]], [[FOR_END164]] ] +; CHECK-NEXT: [[K25_0124_CLONE:%.*]] = phi i32 [ [[K25_0_CLONE:%.*]], [[FOR_BODY30_CLONE]] ], [ [[PHI_SUM]], [[FOR_END164]] ] +; CHECK-NEXT: [[ARRAYIDX31_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K25_0124_CLONE]] +; CHECK-NEXT: [[TMP80:%.*]] = load float, ptr [[ARRAYIDX31_CLONE]], align 4 +; CHECK-NEXT: [[SUB32_CLONE:%.*]] = sub i32 [[N20_0126]], [[K25_0124_CLONE]] +; CHECK-NEXT: [[ARRAYIDX33_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB32_CLONE]] +; CHECK-NEXT: [[TMP81:%.*]] = load float, ptr [[ARRAYIDX33_CLONE]], align 4 +; CHECK-NEXT: [[TMP82]] = tail call float @llvm.fmuladd.f32(float [[TMP80]], float [[TMP81]], float [[TMP79]]) +; CHECK-NEXT: [[K25_0_CLONE]] = add i32 [[K25_0124_CLONE]], 1 +; CHECK-NEXT: [[CMP29_NOT_CLONE:%.*]] = icmp ugt i32 [[K25_0_CLONE]], [[N20_0126]] +; CHECK-NEXT: br i1 [[CMP29_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE25:%.*]], label [[FOR_BODY30_CLONE]] +; CHECK: for.cond.for.end_crit_edge25: +; CHECK-NEXT: store float [[TMP82]], ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: br label [[FOR_END37]] ; CHECK: for.end37: ; CHECK-NEXT: [[INC39]] = add nuw nsw i32 [[N20_0126]], 1 ; CHECK-NEXT: [[EXITCOND133_NOT:%.*]] = icmp eq i32 [[INC39]], [[LSIG_0]] -; CHECK-NEXT: br i1 [[EXITCOND133_NOT]], label [[FOR_COND42_PREHEADER]], label [[FOR_BODY24]] +; CHECK-NEXT: br i1 [[EXITCOND133_NOT]], label [[FOR_COND42_PREHEADER_LOOPEXIT:%.*]], label [[FOR_BODY24]] ; CHECK: for.body47: ; CHECK-NEXT: [[N41_0131:%.*]] = phi i32 [ [[LSIG_0]], [[FOR_BODY47_LR_PH]] ], [ [[INC66:%.*]], [[FOR_END64:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[CONVOUT]], i32 [[N41_0131]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX51]], align 4 ; CHECK-NEXT: [[SUB52:%.*]] = sub nsw i32 [[N41_0131]], [[LKERN_0]] ; CHECK-NEXT: [[K50_0127:%.*]] = add i32 [[SUB52]], 1 -; CHECK-NEXT: [[CMP56_NOT128:%.*]] = icmp ugt i32 [[K50_0127]], [[SUB54]] -; CHECK-NEXT: br i1 [[CMP56_NOT128]], label [[FOR_END64]], label [[FOR_BODY57:%.*]] -; CHECK: for.body57: -; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP11:%.*]], [[FOR_BODY57]] ], [ 0.000000e+00, [[FOR_BODY47]] ] -; CHECK-NEXT: [[K50_0129:%.*]] = phi i32 [ [[K50_0:%.*]], [[FOR_BODY57]] ], [ [[K50_0127]], [[FOR_BODY47]] ] -; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0129]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX58]], align 4 +; CHECK-NEXT: [[ADD207_NEG:%.*]] = xor i32 [[SUB52]], -1 +; CHECK-NEXT: 
[[ADD211:%.*]] = add i32 [[ADD207_NEG]], [[LSIG_0]] +; CHECK-NEXT: [[DIV212535:%.*]] = and i32 [[ADD211]], -8 +; CHECK-NEXT: [[ADD214:%.*]] = add i32 [[DIV212535]], [[K50_0127]] +; CHECK-NEXT: [[CMP56_NOT128:%.*]] = icmp ult i32 [[K50_0127]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP56_NOT128]], label [[FOR_BODY57_PREHEADER:%.*]], label [[FOR_END16434:%.*]] +; CHECK: for.body57.preheader: +; CHECK-NEXT: br label [[FOR_BODY57_7:%.*]] +; CHECK: for.body57.7: +; CHECK-NEXT: [[K50_0129:%.*]] = phi i32 [ [[K50_0127]], [[FOR_BODY57_PREHEADER]] ], [ [[K50_0_7:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI26:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP99:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI27:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP100:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI28:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP101:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI29:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP102:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI30:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP103:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI31:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP104:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI32:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP105:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[DOTPHI33:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY57_PREHEADER]] ], [ [[TMP106:%.*]], [[FOR_BODY57_7]] ] +; CHECK-NEXT: [[K50_0:%.*]] = add i32 [[K50_0129]], 1 +; CHECK-NEXT: [[K50_0_1:%.*]] = add i32 [[K50_0129]], 2 +; CHECK-NEXT: [[K50_0_2:%.*]] = add i32 [[K50_0129]], 3 +; CHECK-NEXT: [[K50_0_3:%.*]] = add i32 [[K50_0129]], 4 +; CHECK-NEXT: [[K50_0_4:%.*]] = add i32 [[K50_0129]], 5 +; CHECK-NEXT: [[K50_0_5:%.*]] = add i32 [[K50_0129]], 6 +; CHECK-NEXT: [[K50_0_6:%.*]] = add i32 [[K50_0129]], 7 +; CHECK-NEXT: [[K50_0_7]] = add i32 [[K50_0129]], 8 ; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[N41_0131]], [[K50_0129]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[N41_0131]], [[K50_0]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[N41_0131]], [[K50_0_1]] +; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[N41_0131]], [[K50_0_2]] +; CHECK-NEXT: [[SUB59_4:%.*]] = sub i32 [[N41_0131]], [[K50_0_3]] +; CHECK-NEXT: [[SUB59_5:%.*]] = sub i32 [[N41_0131]], [[K50_0_4]] +; CHECK-NEXT: [[SUB59_6:%.*]] = sub i32 [[N41_0131]], [[K50_0_5]] +; CHECK-NEXT: [[SUB59_7:%.*]] = sub i32 [[N41_0131]], [[K50_0_6]] +; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0129]] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 -; CHECK-NEXT: [[TMP11]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[TMP8]]) -; CHECK-NEXT: store float [[TMP11]], ptr [[ARRAYIDX51]], align 4 -; CHECK-NEXT: [[K50_0]] = add i32 [[K50_0129]], 1 -; CHECK-NEXT: [[CMP56_NOT:%.*]] = icmp ugt i32 [[K50_0]], [[SUB54]] -; CHECK-NEXT: br i1 [[CMP56_NOT]], label [[FOR_END64]], label [[FOR_BODY57]] +; CHECK-NEXT: [[ARRAYIDX58_1:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0]] +; CHECK-NEXT: [[ARRAYIDX60_1:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_1]] +; CHECK-NEXT: [[ARRAYIDX58_2:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_1]] +; CHECK-NEXT: [[ARRAYIDX60_2:%.*]] = getelementptr 
inbounds float, ptr [[KERN_0]], i32 [[SUB59_2]] +; CHECK-NEXT: [[ARRAYIDX58_3:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_2]] +; CHECK-NEXT: [[ARRAYIDX60_3:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_3]] +; CHECK-NEXT: [[ARRAYIDX58_4:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_3]] +; CHECK-NEXT: [[ARRAYIDX60_4:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_4]] +; CHECK-NEXT: [[ARRAYIDX58_5:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_4]] +; CHECK-NEXT: [[ARRAYIDX60_5:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_5]] +; CHECK-NEXT: [[ARRAYIDX58_6:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_5]] +; CHECK-NEXT: [[ARRAYIDX60_6:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_6]] +; CHECK-NEXT: [[ARRAYIDX58_7:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0_6]] +; CHECK-NEXT: [[ARRAYIDX60_7:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_7]] +; CHECK-NEXT: [[TMP83:%.*]] = load float, ptr [[ARRAYIDX58]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX58_1]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = load float, ptr [[ARRAYIDX60_1]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load float, ptr [[ARRAYIDX58_2]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = load float, ptr [[ARRAYIDX60_2]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load float, ptr [[ARRAYIDX58_3]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load float, ptr [[ARRAYIDX60_3]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[ARRAYIDX58_4]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[ARRAYIDX60_4]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[ARRAYIDX58_5]], align 4 +; CHECK-NEXT: [[TMP94:%.*]] = load float, ptr [[ARRAYIDX60_5]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = load float, ptr [[ARRAYIDX58_6]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = load float, ptr [[ARRAYIDX60_6]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load float, ptr [[ARRAYIDX58_7]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load float, ptr [[ARRAYIDX60_7]], align 4 +; CHECK-NEXT: [[TMP99]] = tail call float @llvm.fmuladd.f32(float [[TMP83]], float [[TMP84]], float [[DOTPHI26]]) +; CHECK-NEXT: [[TMP100]] = tail call float @llvm.fmuladd.f32(float [[TMP85]], float [[TMP86]], float [[DOTPHI27]]) +; CHECK-NEXT: [[TMP101]] = tail call float @llvm.fmuladd.f32(float [[TMP87]], float [[TMP88]], float [[DOTPHI28]]) +; CHECK-NEXT: [[TMP102]] = tail call float @llvm.fmuladd.f32(float [[TMP89]], float [[TMP90]], float [[DOTPHI29]]) +; CHECK-NEXT: [[TMP103]] = tail call float @llvm.fmuladd.f32(float [[TMP91]], float [[TMP92]], float [[DOTPHI30]]) +; CHECK-NEXT: [[TMP104]] = tail call float @llvm.fmuladd.f32(float [[TMP93]], float [[TMP94]], float [[DOTPHI31]]) +; CHECK-NEXT: [[TMP105]] = tail call float @llvm.fmuladd.f32(float [[TMP95]], float [[TMP96]], float [[DOTPHI32]]) +; CHECK-NEXT: [[TMP106]] = tail call float @llvm.fmuladd.f32(float [[TMP97]], float [[TMP98]], float [[DOTPHI33]]) +; CHECK-NEXT: [[CMP56_NOT_7:%.*]] = icmp ult i32 [[K50_0_7]], [[ADD214]] +; CHECK-NEXT: br i1 [[CMP56_NOT_7]], label [[FOR_BODY57_7]], label [[FOR_END64_LOOPEXIT:%.*]] +; CHECK: for.end64.loopexit: +; CHECK-NEXT: [[SUM60:%.*]] = fadd float [[TMP99]], [[TMP100]] +; CHECK-NEXT: [[SUM61:%.*]] = fadd float [[TMP101]], [[TMP102]] +; CHECK-NEXT: [[SUM62:%.*]] = fadd float [[TMP103]], [[TMP104]] +; CHECK-NEXT: 
[[SUM63:%.*]] = fadd float [[TMP105]], [[TMP106]] +; CHECK-NEXT: [[SUM64:%.*]] = fadd float [[SUM60]], [[SUM61]] +; CHECK-NEXT: [[SUM65:%.*]] = fadd float [[SUM62]], [[SUM63]] +; CHECK-NEXT: [[SUM66:%.*]] = fadd float [[SUM64]], [[SUM65]] +; CHECK-NEXT: br label [[FOR_END16434]] +; CHECK: for.end16434: +; CHECK-NEXT: [[PHI_SUM35:%.*]] = phi i32 [ [[K50_0127]], [[FOR_BODY47]] ], [ [[K50_0_7]], [[FOR_END64_LOOPEXIT]] ] +; CHECK-NEXT: [[PHI_FLOAT36:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY47]] ], [ [[SUM66]], [[FOR_END64_LOOPEXIT]] ] +; CHECK-NEXT: store float [[PHI_FLOAT36]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[CMP182_NOT58737:%.*]] = icmp ugt i32 [[PHI_SUM35]], [[SUB54]] +; CHECK-NEXT: br i1 [[CMP182_NOT58737]], label [[FOR_END64]], label [[FOR_BODY57_CLONE:%.*]] +; CHECK: for.body57.clone: +; CHECK-NEXT: [[TMP107:%.*]] = phi float [ [[TMP110:%.*]], [[FOR_BODY57_CLONE]] ], [ [[PHI_FLOAT36]], [[FOR_END16434]] ] +; CHECK-NEXT: [[K50_0129_CLONE:%.*]] = phi i32 [ [[K50_0_CLONE:%.*]], [[FOR_BODY57_CLONE]] ], [ [[PHI_SUM35]], [[FOR_END16434]] ] +; CHECK-NEXT: [[ARRAYIDX58_CLONE:%.*]] = getelementptr inbounds float, ptr [[SIG_0]], i32 [[K50_0129_CLONE]] +; CHECK-NEXT: [[TMP108:%.*]] = load float, ptr [[ARRAYIDX58_CLONE]], align 4 +; CHECK-NEXT: [[SUB59_CLONE:%.*]] = sub i32 [[N41_0131]], [[K50_0129_CLONE]] +; CHECK-NEXT: [[ARRAYIDX60_CLONE:%.*]] = getelementptr inbounds float, ptr [[KERN_0]], i32 [[SUB59_CLONE]] +; CHECK-NEXT: [[TMP109:%.*]] = load float, ptr [[ARRAYIDX60_CLONE]], align 4 +; CHECK-NEXT: [[TMP110]] = tail call float @llvm.fmuladd.f32(float [[TMP108]], float [[TMP109]], float [[TMP107]]) +; CHECK-NEXT: [[K50_0_CLONE]] = add i32 [[K50_0129_CLONE]], 1 +; CHECK-NEXT: [[CMP56_NOT_CLONE:%.*]] = icmp ugt i32 [[K50_0_CLONE]], [[SUB54]] +; CHECK-NEXT: br i1 [[CMP56_NOT_CLONE]], label [[FOR_COND_FOR_END_CRIT_EDGE38:%.*]], label [[FOR_BODY57_CLONE]] +; CHECK: for.cond.for.end_crit_edge38: +; CHECK-NEXT: store float [[TMP110]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: br label [[FOR_END64]] ; CHECK: for.end64: ; CHECK-NEXT: [[INC66]] = add nsw i32 [[N41_0131]], 1 ; CHECK-NEXT: [[EXITCOND134_NOT:%.*]] = icmp eq i32 [[INC66]], [[SUB44]] -; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[RETURN]], label [[FOR_BODY47]] +; CHECK-NEXT: br i1 [[EXITCOND134_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_BODY47]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND42_PREHEADER]] ], [ 0, [[FOR_END64]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND42_PREHEADER]] ], [ 0, [[RETURN_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll index cd8f939112a541..3091bef36bf897 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/corr.ll @@ -1,9 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_corr_f32_ansi(ptr noundef readonly %Signal, i32 noundef %siglen, ptr 
noundef readonly %Pattern, i32 noundef %patlen, ptr noundef writeonly %dest) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_corr_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noundef readonly [[PATTERN:%.*]], i32 noundef [[PATLEN:%.*]], ptr noundef writeonly [[DEST:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[SIGNAL:%.*]], i32 noundef [[SIGLEN:%.*]], ptr noalias noundef readonly [[PATTERN:%.*]], i32 noundef [[PATLEN:%.*]], ptr noalias noundef writeonly [[DEST:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[PATLEN_NEG:%.*]] = sub i32 0, [[PATLEN]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[SIGNAL]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[PATTERN]], null ; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] @@ -11,39 +12,232 @@ define dso_local noundef i32 @dsps_corr_f32_ansi(ptr noundef readonly %Signal, i ; CHECK-NEXT: [[OR_COND33:%.*]] = or i1 [[OR_COND]], [[CMP4]] ; CHECK-NEXT: [[CMP7:%.*]] = icmp slt i32 [[SIGLEN]], [[PATLEN]] ; CHECK-NEXT: [[OR_COND34:%.*]] = or i1 [[CMP7]], [[OR_COND33]] -; CHECK-NEXT: br i1 [[OR_COND34]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] -; CHECK: for.cond.preheader: +; CHECK-NEXT: br i1 [[OR_COND34]], label [[RETURN:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[SIGLEN]], [[PATLEN]] -; CHECK-NEXT: [[CMP1235_NOT:%.*]] = icmp eq i32 [[PATLEN]], 0 -; CHECK-NEXT: br i1 [[CMP1235_NOT]], label [[FOR_COND11_PREHEADER_PREHEADER:%.*]], label [[FOR_COND11_PREHEADER_US:%.*]] +; CHECK-NEXT: [[SUB6:%.*]] = add nsw i32 [[SUB]], -15 +; CHECK-NEXT: [[CMP1235_NOT:%.*]] = icmp sgt i32 [[SUB]], 15 +; CHECK-NEXT: br i1 [[CMP1235_NOT]], label [[FOR_COND8_PREHEADER_LR_PH:%.*]], label [[FOR_COND91_PREHEADER:%.*]] +; CHECK: for.cond8.preheader.lr.ph: +; CHECK-NEXT: [[CMP9242:%.*]] = icmp sgt i32 [[PATLEN]], 0 +; CHECK-NEXT: [[SCEVGEP62:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 60 +; CHECK-NEXT: [[SCEVGEP66:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 56 +; CHECK-NEXT: [[SCEVGEP68:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 52 +; CHECK-NEXT: [[SCEVGEP70:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 48 +; CHECK-NEXT: [[SCEVGEP72:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 44 +; CHECK-NEXT: [[SCEVGEP74:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 40 +; CHECK-NEXT: [[SCEVGEP76:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 36 +; CHECK-NEXT: [[SCEVGEP78:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 32 +; CHECK-NEXT: [[SCEVGEP80:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 28 +; CHECK-NEXT: [[SCEVGEP82:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 24 +; CHECK-NEXT: [[SCEVGEP84:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 20 +; CHECK-NEXT: [[SCEVGEP86:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 16 +; CHECK-NEXT: [[SCEVGEP88:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 12 +; CHECK-NEXT: [[SCEVGEP90:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 8 +; CHECK-NEXT: [[SCEVGEP92:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 4 +; CHECK-NEXT: br label [[FOR_COND8_PREHEADER:%.*]] +; CHECK: for.cond8.preheader: +; CHECK-NEXT: [[LSR_IV95:%.*]] = phi ptr [ [[SCEVGEP96:%.*]], [[FOR_COND_CLEANUP:%.*]] ], [ [[SIGNAL]], [[FOR_COND8_PREHEADER_LR_PH]] ] +; CHECK-NEXT: [[N_0276:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_LR_PH]] ], [ [[ADD89:%.*]], [[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: br i1 [[CMP9242]], label [[FOR_BODY10_LR_PH:%.*]], label [[FOR_COND_CLEANUP]] +; CHECK: for.body10.lr.ph: +; 
CHECK-NEXT: br label [[FOR_BODY14_US_UNROLL:%.*]] +; CHECK: for.cond91.preheader.loopexit: +; CHECK-NEXT: br label [[FOR_COND91_PREHEADER]] +; CHECK: for.cond91.preheader: +; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[ADD89]], [[FOR_COND91_PREHEADER_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[CMP92_NOT282:%.*]] = icmp sgt i32 [[N_0_LCSSA]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP92_NOT282]], label [[RETURN]], label [[FOR_COND95_PREHEADER_LR_PH:%.*]] +; CHECK: for.cond95.preheader.lr.ph: +; CHECK-NEXT: [[CMP92678:%.*]] = icmp sgt i32 [[PATLEN]], 0 +; CHECK-NEXT: br i1 [[CMP92678]], label [[FOR_COND11_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND11_PREHEADER_PREHEADER:%.*]] ; CHECK: for.cond11.preheader.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[SIGLEN]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: tail call void @llvm.memset.p0.i32(ptr nonnull align 4 [[DEST]], i8 0, i32 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST]], i32 [[TMP0]] +; CHECK-NEXT: [[N_0_LCSSA_NEG:%.*]] = sub i32 0, [[N_0_LCSSA]] +; CHECK-NEXT: [[DOTNEG:%.*]] = add i32 [[SIGLEN]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[DOTNEG]], [[PATLEN_NEG]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[N_0_LCSSA_NEG]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i32(ptr nonnull align 4 [[SCEVGEP]], i8 0, i32 [[TMP3]], i1 false) ; CHECK-NEXT: br label [[RETURN]] +; CHECK: for.cond11.preheader.us.preheader: +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SIGLEN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[PATLEN_NEG]] +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[N_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP102:%.*]] = getelementptr i8, ptr [[SIGNAL]], i32 [[TMP6]] +; CHECK-NEXT: br label [[FOR_COND11_PREHEADER_US:%.*]] ; CHECK: for.cond11.preheader.us: -; CHECK-NEXT: [[N_038_US:%.*]] = phi i32 [ [[INC18_US:%.*]], [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[SIGNAL]], i32 [[N_038_US]] +; CHECK-NEXT: [[LSR_IV103:%.*]] = phi ptr [ [[SCEVGEP104:%.*]], [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US:%.*]] ], [ [[SCEVGEP102]], [[FOR_COND11_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[N_038_US:%.*]] = phi i32 [ [[INC18_US:%.*]], [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US]] ], [ [[N_0_LCSSA]], [[FOR_COND11_PREHEADER_US_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY14_US:%.*]] ; CHECK: for.body14.us: -; CHECK-NEXT: [[M_037_US:%.*]] = phi i32 [ 0, [[FOR_COND11_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY14_US]] ] -; CHECK-NEXT: [[K_CORR_036_US:%.*]] = phi float [ 0.000000e+00, [[FOR_COND11_PREHEADER_US]] ], [ [[TMP5:%.*]], [[FOR_BODY14_US]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr float, ptr [[TMP2]], i32 [[M_037_US]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 -; CHECK-NEXT: [[ARRAYIDX15_US:%.*]] = getelementptr inbounds float, ptr [[PATTERN]], i32 [[M_037_US]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX15_US]], align 4 -; CHECK-NEXT: [[TMP5]] = tail call float @llvm.fmuladd.f32(float [[TMP3]], float [[TMP4]], float [[K_CORR_036_US]]) -; CHECK-NEXT: [[INC_US]] = add nuw i32 [[M_037_US]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_US]], [[PATLEN]] +; CHECK-NEXT: [[LSR_IV105:%.*]] = phi ptr [ [[SCEVGEP106:%.*]], [[FOR_BODY14_US]] ], [ [[LSR_IV103]], [[FOR_COND11_PREHEADER_US]] ] +; CHECK-NEXT: [[LSR_IV100:%.*]] = phi 
ptr [ [[SCEVGEP101:%.*]], [[FOR_BODY14_US]] ], [ [[PATTERN]], [[FOR_COND11_PREHEADER_US]] ] +; CHECK-NEXT: [[LSR_IV98:%.*]] = phi i32 [ [[LSR_IV_NEXT99:%.*]], [[FOR_BODY14_US]] ], [ [[PATLEN]], [[FOR_COND11_PREHEADER_US]] ] +; CHECK-NEXT: [[K_CORR_036_US:%.*]] = phi float [ 0.000000e+00, [[FOR_COND11_PREHEADER_US]] ], [ [[TMP9:%.*]], [[FOR_BODY14_US]] ] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[LSR_IV105]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[LSR_IV100]], align 4 +; CHECK-NEXT: [[TMP9]] = tail call float @llvm.fmuladd.f32(float [[TMP7]], float [[TMP8]], float [[K_CORR_036_US]]) +; CHECK-NEXT: [[LSR_IV_NEXT99]] = add i32 [[LSR_IV98]], -1 +; CHECK-NEXT: [[SCEVGEP101]] = getelementptr i8, ptr [[LSR_IV100]], i32 4 +; CHECK-NEXT: [[SCEVGEP106]] = getelementptr i8, ptr [[LSR_IV105]], i32 4 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT99]], 0 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US]], label [[FOR_BODY14_US]] ; CHECK: for.cond11.for.cond.cleanup13_crit_edge.us: ; CHECK-NEXT: [[ARRAYIDX16_US:%.*]] = getelementptr inbounds float, ptr [[DEST]], i32 [[N_038_US]] -; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX16_US]], align 4 +; CHECK-NEXT: store float [[TMP9]], ptr [[ARRAYIDX16_US]], align 4 ; CHECK-NEXT: [[INC18_US]] = add nuw i32 [[N_038_US]], 1 -; CHECK-NEXT: [[CMP10_NOT_US_NOT:%.*]] = icmp ult i32 [[N_038_US]], [[SUB]] -; CHECK-NEXT: br i1 [[CMP10_NOT_US_NOT]], label [[FOR_COND11_PREHEADER_US]], label [[RETURN]] +; CHECK-NEXT: [[SCEVGEP104]] = getelementptr i8, ptr [[LSR_IV103]], i32 4 +; CHECK-NEXT: [[CMP10_NOT_US_NOT:%.*]] = icmp eq i32 [[INC18_US]], [[TMP5]] +; CHECK-NEXT: br i1 [[CMP10_NOT_US_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_COND11_PREHEADER_US]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[TMP10:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP58:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP59:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP60:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP61:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP62:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP63:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP64:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP65:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP67:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP68:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP69:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi float [ 0.000000e+00, 
[[FOR_COND8_PREHEADER]] ], [ [[TMP70:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP23:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP71:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP24:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP72:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi float [ 0.000000e+00, [[FOR_COND8_PREHEADER]] ], [ [[TMP73:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD89]] = add nuw nsw i32 [[N_0276]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[N_0276]], 1 +; CHECK-NEXT: [[ADD17:%.*]] = or disjoint i32 [[N_0276]], 2 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[N_0276]], 3 +; CHECK-NEXT: [[ADD21:%.*]] = or disjoint i32 [[N_0276]], 4 +; CHECK-NEXT: [[ADD23:%.*]] = or disjoint i32 [[N_0276]], 5 +; CHECK-NEXT: [[ADD25:%.*]] = or disjoint i32 [[N_0276]], 6 +; CHECK-NEXT: [[ADD27:%.*]] = or disjoint i32 [[N_0276]], 7 +; CHECK-NEXT: [[ADD29:%.*]] = or disjoint i32 [[N_0276]], 8 +; CHECK-NEXT: [[ADD31:%.*]] = or disjoint i32 [[N_0276]], 9 +; CHECK-NEXT: [[ADD33:%.*]] = or disjoint i32 [[N_0276]], 10 +; CHECK-NEXT: [[ADD35:%.*]] = or disjoint i32 [[N_0276]], 11 +; CHECK-NEXT: [[ADD37:%.*]] = or disjoint i32 [[N_0276]], 12 +; CHECK-NEXT: [[ADD39:%.*]] = or disjoint i32 [[N_0276]], 13 +; CHECK-NEXT: [[ADD41:%.*]] = or disjoint i32 [[N_0276]], 14 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[N_0276]], 15 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr float, ptr [[DEST]], i32 [[N_0276]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD17]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD21]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD23]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD25]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD27]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD29]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD31]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD33]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD35]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD37]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD39]] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD41]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr float, ptr [[DEST]], i32 [[ADD43]] +; CHECK-NEXT: store float [[TMP10]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store float [[TMP11]], ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: store float [[TMP12]], ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: store float [[TMP13]], ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: store float [[TMP14]], ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: store float [[TMP15]], ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: store float [[TMP16]], ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: store float [[TMP17]], ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: store float [[TMP18]], ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: store float [[TMP19]], ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: store float [[TMP20]], ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: store 
float [[TMP21]], ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: store float [[TMP22]], ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: store float [[TMP23]], ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: store float [[TMP24]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: store float [[TMP25]], ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[SCEVGEP96]] = getelementptr i8, ptr [[LSR_IV95]], i32 64 +; CHECK-NEXT: [[CMP745:%.*]] = icmp slt i32 [[ADD89]], [[SUB6]] +; CHECK-NEXT: br i1 [[CMP745]], label [[FOR_COND8_PREHEADER]], label [[FOR_COND91_PREHEADER_LOOPEXIT]] +; CHECK: for.body14.us.unroll: +; CHECK-NEXT: [[LSR_IV63:%.*]] = phi i32 [ 0, [[FOR_BODY10_LR_PH]] ], [ [[LSR_IV_NEXT64:%.*]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[PATLEN]], [[FOR_BODY10_LR_PH]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[K_CORR_036_US_UNROLL:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP58]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP59]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP60]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP61]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP62]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP63]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP64]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP32:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP65]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP66]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP67]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP68]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP36:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP69]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP37:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP70]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP71]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP72]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[TMP40:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY10_LR_PH]] ], [ [[TMP73]], [[FOR_BODY14_US_UNROLL]] ] +; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i8, ptr [[PATTERN]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP97:%.*]] = getelementptr i8, ptr [[LSR_IV95]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i8, ptr [[SCEVGEP92]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP91:%.*]] = getelementptr i8, ptr [[SCEVGEP90]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP89:%.*]] = getelementptr i8, ptr [[SCEVGEP88]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP87:%.*]] = getelementptr i8, ptr [[SCEVGEP86]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP85:%.*]] = getelementptr i8, ptr [[SCEVGEP84]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP83:%.*]] = getelementptr i8, ptr [[SCEVGEP82]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP81:%.*]] = getelementptr i8, ptr 
[[SCEVGEP80]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP79:%.*]] = getelementptr i8, ptr [[SCEVGEP78]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP77:%.*]] = getelementptr i8, ptr [[SCEVGEP76]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP75:%.*]] = getelementptr i8, ptr [[SCEVGEP74]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP73:%.*]] = getelementptr i8, ptr [[SCEVGEP72]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP71:%.*]] = getelementptr i8, ptr [[SCEVGEP70]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP69:%.*]] = getelementptr i8, ptr [[SCEVGEP68]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP67:%.*]] = getelementptr i8, ptr [[SCEVGEP66]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[SCEVGEP65:%.*]] = getelementptr i8, ptr [[SCEVGEP62]], i32 [[LSR_IV63]] +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[SCEVGEP94]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[SCEVGEP97]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[SCEVGEP93]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[SCEVGEP91]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCEVGEP89]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[SCEVGEP87]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCEVGEP85]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[SCEVGEP83]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCEVGEP81]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[SCEVGEP79]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[SCEVGEP77]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCEVGEP75]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCEVGEP73]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[SCEVGEP71]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[SCEVGEP69]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[SCEVGEP67]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCEVGEP65]], align 4 +; CHECK-NEXT: [[TMP58]] = tail call float @llvm.fmuladd.f32(float [[TMP42]], float [[TMP41]], float [[K_CORR_036_US_UNROLL]]) +; CHECK-NEXT: [[TMP59]] = tail call float @llvm.fmuladd.f32(float [[TMP43]], float [[TMP41]], float [[TMP26]]) +; CHECK-NEXT: [[TMP60]] = tail call float @llvm.fmuladd.f32(float [[TMP44]], float [[TMP41]], float [[TMP27]]) +; CHECK-NEXT: [[TMP61]] = tail call float @llvm.fmuladd.f32(float [[TMP45]], float [[TMP41]], float [[TMP28]]) +; CHECK-NEXT: [[TMP62]] = tail call float @llvm.fmuladd.f32(float [[TMP46]], float [[TMP41]], float [[TMP29]]) +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP47]], float [[TMP41]], float [[TMP30]]) +; CHECK-NEXT: [[TMP64]] = tail call float @llvm.fmuladd.f32(float [[TMP48]], float [[TMP41]], float [[TMP31]]) +; CHECK-NEXT: [[TMP65]] = tail call float @llvm.fmuladd.f32(float [[TMP49]], float [[TMP41]], float [[TMP32]]) +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP50]], float [[TMP41]], float [[TMP33]]) +; CHECK-NEXT: [[TMP67]] = tail call float @llvm.fmuladd.f32(float [[TMP51]], float [[TMP41]], float [[TMP34]]) +; CHECK-NEXT: [[TMP68]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP41]], float [[TMP35]]) +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP53]], float [[TMP41]], float [[TMP36]]) +; CHECK-NEXT: [[TMP70]] = tail call float @llvm.fmuladd.f32(float [[TMP54]], float [[TMP41]], float [[TMP37]]) +; CHECK-NEXT: [[TMP71]] = tail call float @llvm.fmuladd.f32(float [[TMP55]], float [[TMP41]], float [[TMP38]]) +; CHECK-NEXT: [[TMP72]] = tail call 
float @llvm.fmuladd.f32(float [[TMP56]], float [[TMP41]], float [[TMP39]]) +; CHECK-NEXT: [[TMP73]] = tail call float @llvm.fmuladd.f32(float [[TMP57]], float [[TMP41]], float [[TMP40]]) +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[LSR_IV_NEXT64]] = add nuw i32 [[LSR_IV63]], 4 +; CHECK-NEXT: [[EXITCOND_NOT_UNROLL:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_UNROLL]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY14_US_UNROLL]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND11_PREHEADER_PREHEADER]] ], [ 0, [[FOR_COND11_FOR_COND_CLEANUP13_CRIT_EDGE_US]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND11_PREHEADER_PREHEADER]] ], [ 0, [[FOR_COND91_PREHEADER]] ], [ 0, [[RETURN_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll index 2fe5f8edd108cc..af95e0500cf2c2 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod.ll @@ -1,37 +1,126 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_dotprod_f32_ansi(ptr nocapture noundef readonly %src1, ptr nocapture noundef readonly %src2, ptr nocapture noundef writeonly %dest, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_dotprod_f32_ansi( -; CHECK-SAME: ptr nocapture noundef readonly [[SRC1:%.*]], ptr nocapture noundef readonly [[SRC2:%.*]], ptr nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[SRC1:%.*]], ptr noalias nocapture noundef readonly [[SRC2:%.*]], ptr noalias nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER1:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[LEN]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: [[CMP47110:%.*]] = icmp sgt i32 [[LEN]], 0 +; CHECK-NEXT: br i1 [[CMP47110]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[ADD44:%.*]], [[FOR_END37:%.*]] ], [ [[TMP31:%.*]], [[FOR_BODY_CLONE]] ] ; CHECK-NEXT: store float [[ACC_0_LCSSA]], ptr [[DEST]], align 4 ; CHECK-NEXT: ret i32 0 +; CHECK: for.cond.preheader1: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -7 +; CHECK-NEXT: [[CMP1113:%.*]] = icmp ugt i32 [[LEN]], 7 +; CHECK-NEXT: br i1 [[CMP1113]], 
label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND31_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LEN]], 2147483640 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond31.preheader: +; CHECK-NEXT: [[ACC0_0_LCSSA:%.*]] = phi float [ [[TMP4:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC1_0_LCSSA:%.*]] = phi float [ [[TMP7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC2_0_LCSSA:%.*]] = phi float [ [[TMP10:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC3_0_LCSSA:%.*]] = phi float [ [[TMP13:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC4_0_LCSSA:%.*]] = phi float [ [[TMP16:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC5_0_LCSSA:%.*]] = phi float [ [[TMP19:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC6_0_LCSSA:%.*]] = phi float [ [[TMP22:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC7_0_LCSSA:%.*]] = phi float [ [[TMP25:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER1]] ], [ [[TMP1]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP32132:%.*]] = icmp slt i32 [[I_0_LCSSA]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP32132]], label [[FOR_BODY33:%.*]], label [[FOR_END37]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ACC_07:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_08]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_08]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP3]] = tail call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP2]], float [[ACC_07]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END]], label [[FOR_BODY]] +; CHECK-NEXT: [[I_0122:%.*]] = phi i32 [ [[ADD30:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC_07:%.*]] = phi float [ [[TMP4]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC1:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC2:%.*]] = phi float [ [[TMP10]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC3:%.*]] = phi float [ [[TMP13]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ [[TMP16]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC5:%.*]] = phi float [ [[TMP19]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC6:%.*]] = phi float [ [[TMP22]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ [[TMP25]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_0122]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_0122]] +; CHECK-NEXT: 
[[TMP3:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP4]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[ACC_07]]) +; CHECK-NEXT: [[ADD1:%.*]] = or disjoint i32 [[I_0122]], 1 +; CHECK-NEXT: [[ARRAYIDX1_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1_1]], align 4 +; CHECK-NEXT: [[TMP7]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[ACC1]]) +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[I_0122]], 2 +; CHECK-NEXT: [[ARRAYIDX2_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[TMP10]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC2]]) +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_0122]], 3 +; CHECK-NEXT: [[ARRAYIDX3_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX3_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[TMP13]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[ACC3]]) +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_0122]], 4 +; CHECK-NEXT: [[ARRAYIDX4_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX4_1]], align 4 +; CHECK-NEXT: [[TMP16]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[ACC4]]) +; CHECK-NEXT: [[ADD5:%.*]] = or disjoint i32 [[I_0122]], 5 +; CHECK-NEXT: [[ARRAYIDX5_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX5_1]], align 4 +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP17]], float [[TMP18]], float [[ACC5]]) +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_0122]], 6 +; CHECK-NEXT: [[ARRAYIDX6_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX6_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6_1]], align 4 +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC6]]) +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_0122]], 7 +; CHECK-NEXT: [[ARRAYIDX7_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX7_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr 
[[ARRAYIDX7_1]], align 4 +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP23]], float [[TMP24]], float [[ACC7]]) +; CHECK-NEXT: [[ADD30]] = add nuw nsw i32 [[I_0122]], 8 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD30]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY]], label [[FOR_COND31_PREHEADER]] +; CHECK: for.body33: +; CHECK-NEXT: [[I_0833:%.*]] = phi i32 [ [[INC33:%.*]], [[FOR_BODY33]] ], [ [[I_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[ACC_0733:%.*]] = phi float [ [[TMP28:%.*]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_0833]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_0833]] +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX133]], align 4 +; CHECK-NEXT: [[TMP28]] = tail call float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[ACC_0733]]) +; CHECK-NEXT: [[INC33]] = add nuw nsw i32 [[I_0833]], 1 +; CHECK-NEXT: [[EXITCOND_NOT33:%.*]] = icmp eq i32 [[INC33]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT33]], label [[FOR_END37]], label [[FOR_BODY33]] +; CHECK: for.end37: +; CHECK-NEXT: [[ACC0_1_LCSSA:%.*]] = phi float [ [[TMP28]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[SUM01:%.*]] = fadd float [[ACC1_0_LCSSA]], [[ACC0_1_LCSSA]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[ACC2_0_LCSSA]], [[ACC3_0_LCSSA]] +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[ACC4_0_LCSSA]], [[ACC5_0_LCSSA]] +; CHECK-NEXT: [[SUM67:%.*]] = fadd float [[ACC6_0_LCSSA]], [[ACC7_0_LCSSA]] +; CHECK-NEXT: [[SUM0123:%.*]] = fadd float [[SUM23]], [[SUM01]] +; CHECK-NEXT: [[SUM4567:%.*]] = fadd float [[SUM45]], [[SUM67]] +; CHECK-NEXT: [[ADD44]] = fadd float [[SUM4567]], [[SUM0123]] +; CHECK-NEXT: br label [[IF_END]] ; CHECK: for.body.clone: ; CHECK-NEXT: [[I_08_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; CHECK-NEXT: [[ACC_07_CLONE:%.*]] = phi float [ [[TMP6]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] +; CHECK-NEXT: [[ACC_07_CLONE:%.*]] = phi float [ [[TMP31]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[I_08_CLONE]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[ARRAYIDX1_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[I_08_CLONE]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_CLONE]], align 4 -; CHECK-NEXT: [[TMP6]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[ACC_07_CLONE]]) +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX1_CLONE]], align 4 +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[ACC_07_CLONE]]) ; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[I_08_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[IF_END]], label [[FOR_BODY_CLONE]] diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll index 8db7f9dd4c7882..60c76b1ad159d9 100644 --- 
a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprod_template_complex.ll @@ -1,28 +1,115 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local float @test_loop(ptr nocapture noundef readonly %data1, ptr nocapture noundef readonly %data2, i32 noundef %start_index, i32 noundef %end_index, i32 noundef %update1, i32 noundef %update2, float noundef %offset) local_unnamed_addr { ; CHECK-LABEL: define dso_local float @test_loop( -; CHECK-SAME: ptr nocapture noundef readonly [[DATA1:%.*]], ptr nocapture noundef readonly [[DATA2:%.*]], i32 noundef [[START_INDEX:%.*]], i32 noundef [[END_INDEX:%.*]], i32 noundef [[UPDATE1:%.*]], i32 noundef [[UPDATE2:%.*]], float noundef [[OFFSET:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[DATA1:%.*]], ptr noalias nocapture noundef readonly [[DATA2:%.*]], i32 noundef [[START_INDEX:%.*]], i32 noundef [[END_INDEX:%.*]], i32 noundef [[UPDATE1:%.*]], i32 noundef [[UPDATE2:%.*]], float noundef [[OFFSET:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[DATA1]], i32 [[UPDATE1]] ; CHECK-NEXT: [[INVARIANT_GEP8:%.*]] = getelementptr float, ptr [[DATA2]], i32 [[UPDATE2]] -; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[START_INDEX]], [[END_INDEX]] -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[START_INDEX]], [[ENTRY]] ] -; CHECK-NEXT: [[RESULT_011:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[END_INDEX]], -8 +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[SUB]], [[START_INDEX]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_COND_PREHEADER:%.*]], label [[FOR_BODY_7:%.*]] +; CHECK: for.cond.preheader: +; CHECK-NEXT: [[RESULT0_0_LCSSA:%.*]] = phi i32 [ [[START_INDEX]], [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_7:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_6:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_5:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA4:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_4:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_3:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA6:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_2:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA7:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_1:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0_0_LCSSA8:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp 
slt i32 [[RESULT0_0_LCSSA]], [[END_INDEX]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_CLONE:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.7: +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[START_INDEX]], [[ENTRY]] ], [ [[INC_7]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT6:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_6]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_5]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT4:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_4]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_3]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_2]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3_1]], [[FOR_BODY_7]] ] +; CHECK-NEXT: [[RESULT0:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3]], [[FOR_BODY_7]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[I_012]] ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4 ; CHECK-NEXT: [[GEP9:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[I_012]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP9]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[OFFSET]]) -; CHECK-NEXT: [[ADD3]] = fadd float [[RESULT_011]], [[TMP2]] -; CHECK-NEXT: [[INC]] = add nsw i32 [[I_012]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[END_INDEX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: [[ADD3]] = fadd float [[RESULT0]], [[TMP2]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[I_012]], 1 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_1]], align 4 +; CHECK-NEXT: [[GEP9_1:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP9_1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP3]], float [[TMP4]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_1]] = fadd float [[RESULT1]], [[TMP5]] +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 [[I_012]], 2 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[GEP9_2:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[GEP9_2]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_2]] = fadd float [[RESULT2]], [[TMP8]] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[I_012]], 3 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[GEP_3]], align 4 +; CHECK-NEXT: [[GEP9_3:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[GEP9_3]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_3]] = fadd float [[RESULT3]], [[TMP11]] +; CHECK-NEXT: [[INC_3:%.*]] = add nsw i32 [[I_012]], 4 +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_3]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[GEP_4]], align 4 +; 
CHECK-NEXT: [[GEP9_4:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_3]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[GEP9_4]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_4]] = fadd float [[RESULT4]], [[TMP14]] +; CHECK-NEXT: [[INC_4:%.*]] = add nsw i32 [[I_012]], 5 +; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_4]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[GEP_5]], align 4 +; CHECK-NEXT: [[GEP9_5:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_4]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[GEP9_5]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_5]] = fadd float [[RESULT5]], [[TMP17]] +; CHECK-NEXT: [[INC_5:%.*]] = add nsw i32 [[I_012]], 6 +; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_5]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[GEP_6]], align 4 +; CHECK-NEXT: [[GEP9_6:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_5]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[GEP9_6]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP18]], float [[TMP19]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_6]] = fadd float [[RESULT6]], [[TMP20]] +; CHECK-NEXT: [[INC_6:%.*]] = add nsw i32 [[I_012]], 7 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[INC_6]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[GEP_7]], align 4 +; CHECK-NEXT: [[GEP9_7:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[INC_6]] +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[GEP9_7]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_7]] = fadd float [[ADD3_6]], [[TMP23]] +; CHECK-NEXT: [[INC_7]] = add nsw i32 [[I_012]], 8 +; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp slt i32 [[INC_7]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_COND_PREHEADER]], label [[FOR_BODY_7]] +; CHECK: for.body.clone: +; CHECK-NEXT: [[I_012_CLONE:%.*]] = phi i32 [ [[RESULT0_0_LCSSA]], [[FOR_COND_PREHEADER]] ], [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RESULT_011_CLONE:%.*]] = phi float [ [[RESULT0_0_LCSSA8]], [[FOR_COND_PREHEADER]] ], [ [[ADD3_CLONE:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[GEP_CLONE:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i32 [[I_012_CLONE]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[GEP_CLONE]], align 4 +; CHECK-NEXT: [[GEP9_CLONE:%.*]] = getelementptr float, ptr [[INVARIANT_GEP8]], i32 [[I_012_CLONE]] +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[GEP9_CLONE]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP24]], float [[TMP25]], float [[OFFSET]]) +; CHECK-NEXT: [[ADD3_CLONE]] = fadd float [[RESULT_011_CLONE]], [[TMP26]] +; CHECK-NEXT: [[INC_CLONE]] = add nsw i32 [[I_012_CLONE]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[END_INDEX]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[FOR_END]], label [[FOR_BODY_CLONE]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[ADD3_CLONE]], [[FOR_BODY_CLONE]] ], [ [[RESULT0_0_LCSSA8]], [[FOR_COND_PREHEADER]] ] +; CHECK-NEXT: [[ADD64:%.*]] = fadd float [[RESULT0_0_LCSSA1]], [[RESULT_0_LCSSA]] +; CHECK-NEXT: [[ADD65:%.*]] = fadd float 
[[RESULT0_0_LCSSA2]], [[RESULT0_0_LCSSA3]] +; CHECK-NEXT: [[ADD66:%.*]] = fadd float [[RESULT0_0_LCSSA4]], [[RESULT0_0_LCSSA5]] +; CHECK-NEXT: [[ADD67:%.*]] = fadd float [[RESULT0_0_LCSSA6]], [[RESULT0_0_LCSSA7]] +; CHECK-NEXT: [[ADD68:%.*]] = fadd float [[ADD65]], [[ADD64]] +; CHECK-NEXT: [[ADD69:%.*]] = fadd float [[ADD66]], [[ADD67]] +; CHECK-NEXT: [[ADD70:%.*]] = fadd float [[ADD69]], [[ADD68]] +; CHECK-NEXT: ret float [[ADD70]] ; entry: %invariant.gep = getelementptr float, ptr %data1, i32 %update1 diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll index 78ea995d2a297d..9922d9aa34f774 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/dotprode.ll @@ -1,41 +1,132 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_dotprode_f32_ansi(ptr nocapture noundef readonly %src1, ptr nocapture noundef readonly %src2, ptr nocapture noundef writeonly %dest, i32 noundef %len, i32 noundef %step1, i32 noundef %step2) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_dotprode_f32_ansi( -; CHECK-SAME: ptr nocapture noundef readonly [[SRC1:%.*]], ptr nocapture noundef readonly [[SRC2:%.*]], ptr nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[SRC1:%.*]], ptr noalias nocapture noundef readonly [[SRC2:%.*]], ptr noalias nocapture noundef writeonly [[DEST:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER1:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[LEN]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: [[CMP47110:%.*]] = icmp sgt i32 [[LEN]], 0 +; CHECK-NEXT: br i1 [[CMP47110]], label [[FOR_BODY_CLONE:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_PREHEADER]] ], [ [[ADD44:%.*]], [[FOR_END37:%.*]] ], [ [[TMP31:%.*]], [[FOR_BODY_CLONE]] ] ; CHECK-NEXT: store float [[ACC_0_LCSSA]], ptr [[DEST]], align 4 ; CHECK-NEXT: ret i32 0 +; CHECK: for.cond.preheader1: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -7 +; CHECK-NEXT: [[CMP1113:%.*]] = icmp ugt i32 [[LEN]], 7 +; CHECK-NEXT: br i1 [[CMP1113]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND31_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LEN]], 2147483640 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond31.preheader: +; CHECK-NEXT: [[ACC0_0_LCSSA:%.*]] = phi float [ [[TMP4:%.*]], 
[[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC1_0_LCSSA:%.*]] = phi float [ [[TMP7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC2_0_LCSSA:%.*]] = phi float [ [[TMP10:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC3_0_LCSSA:%.*]] = phi float [ [[TMP13:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC4_0_LCSSA:%.*]] = phi float [ [[TMP16:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC5_0_LCSSA:%.*]] = phi float [ [[TMP19:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC6_0_LCSSA:%.*]] = phi float [ [[TMP22:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[ACC7_0_LCSSA:%.*]] = phi float [ [[TMP25:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER1]] ] +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER1]] ], [ [[TMP1]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP32132:%.*]] = icmp slt i32 [[I_0_LCSSA]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP32132]], label [[FOR_BODY33:%.*]], label [[FOR_END37]] ; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ACC_09:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_010]], [[STEP1]] +; CHECK-NEXT: [[I_0122:%.*]] = phi i32 [ [[ADD30:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC_09:%.*]] = phi float [ [[TMP4]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC1:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC2:%.*]] = phi float [ [[TMP10]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC3:%.*]] = phi float [ [[TMP13]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ [[TMP16]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC5:%.*]] = phi float [ [[TMP19]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC6:%.*]] = phi float [ [[TMP22]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ [[TMP25]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_0122]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[I_010]], [[STEP2]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[I_0122]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[MUL1]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP3]] = tail call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP2]], float [[ACC_09]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END]], label [[FOR_BODY]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP4]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[ACC_09]]) +; CHECK-NEXT: [[ADD1:%.*]] = or disjoint i32 [[I_0122]], 1 +; 
CHECK-NEXT: [[ARRAYIDX1_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD1]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1_1]], align 4 +; CHECK-NEXT: [[TMP7]] = tail call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP6]], float [[ACC1]]) +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[I_0122]], 2 +; CHECK-NEXT: [[ARRAYIDX2_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD2]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[TMP10]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC2]]) +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_0122]], 3 +; CHECK-NEXT: [[ARRAYIDX3_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX3_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[TMP13]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[ACC3]]) +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_0122]], 4 +; CHECK-NEXT: [[ARRAYIDX4_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD4]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX4_1]], align 4 +; CHECK-NEXT: [[TMP16]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[ACC4]]) +; CHECK-NEXT: [[ADD5:%.*]] = or disjoint i32 [[I_0122]], 5 +; CHECK-NEXT: [[ARRAYIDX5_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX5_1]], align 4 +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP17]], float [[TMP18]], float [[ACC5]]) +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_0122]], 6 +; CHECK-NEXT: [[ARRAYIDX6_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX6_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD6]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6_1]], align 4 +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC6]]) +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_0122]], 7 +; CHECK-NEXT: [[ARRAYIDX7_0:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX7_0]], align 4 +; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX7_1]], align 4 +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP23]], float [[TMP24]], float [[ACC7]]) +; CHECK-NEXT: [[ADD30]] = add nuw nsw i32 [[I_0122]], 8 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 
[[ADD30]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY]], label [[FOR_COND31_PREHEADER]] +; CHECK: for.body33: +; CHECK-NEXT: [[I_01033:%.*]] = phi i32 [ [[INC33:%.*]], [[FOR_BODY33]] ], [ [[I_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[ACC_0933:%.*]] = phi float [ [[TMP28:%.*]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[MUL33:%.*]] = mul nsw i32 [[I_01033]], [[STEP1]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[MUL33]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[MUL133:%.*]] = mul nsw i32 [[I_01033]], [[STEP2]] +; CHECK-NEXT: [[ARRAYIDX233:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[MUL133]] +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX233]], align 4 +; CHECK-NEXT: [[TMP28]] = tail call float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[ACC_0933]]) +; CHECK-NEXT: [[INC33]] = add nuw nsw i32 [[I_01033]], 1 +; CHECK-NEXT: [[EXITCOND_NOT33:%.*]] = icmp eq i32 [[INC33]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT33]], label [[FOR_END37]], label [[FOR_BODY33]] +; CHECK: for.end37: +; CHECK-NEXT: [[ACC0_1_LCSSA:%.*]] = phi float [ [[TMP28]], [[FOR_BODY33]] ], [ [[ACC0_0_LCSSA]], [[FOR_COND31_PREHEADER]] ] +; CHECK-NEXT: [[SUM01:%.*]] = fadd float [[ACC1_0_LCSSA]], [[ACC0_1_LCSSA]] +; CHECK-NEXT: [[SUM23:%.*]] = fadd float [[ACC2_0_LCSSA]], [[ACC3_0_LCSSA]] +; CHECK-NEXT: [[SUM45:%.*]] = fadd float [[ACC4_0_LCSSA]], [[ACC5_0_LCSSA]] +; CHECK-NEXT: [[SUM67:%.*]] = fadd float [[ACC6_0_LCSSA]], [[ACC7_0_LCSSA]] +; CHECK-NEXT: [[SUM0123:%.*]] = fadd float [[SUM23]], [[SUM01]] +; CHECK-NEXT: [[SUM4567:%.*]] = fadd float [[SUM45]], [[SUM67]] +; CHECK-NEXT: [[ADD44]] = fadd float [[SUM4567]], [[SUM0123]] +; CHECK-NEXT: br label [[IF_END]] ; CHECK: for.body.clone: ; CHECK-NEXT: [[I_010_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; CHECK-NEXT: [[ACC_09_CLONE:%.*]] = phi float [ [[TMP6]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] +; CHECK-NEXT: [[ACC_09_CLONE:%.*]] = phi float [ [[TMP31]], [[FOR_BODY_CLONE]] ], [ 0.000000e+00, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_010_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL1_CLONE:%.*]] = mul nsw i32 [[I_010_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX2_CLONE:%.*]] = getelementptr inbounds float, ptr [[SRC2]], i32 [[MUL1_CLONE]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2_CLONE]], align 4 -; CHECK-NEXT: [[TMP6]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[ACC_09_CLONE]]) +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX2_CLONE]], align 4 +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[ACC_09_CLONE]]) ; CHECK-NEXT: [[INC_CLONE]] = add nuw nsw i32 [[I_010_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[IF_END]], label [[FOR_BODY_CLONE]] diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll index 6a8cb4868b7ea6..61470c86fb2152 100644 --- 
a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fir.ll @@ -1,28 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s %struct.fir_f32_s = type { ptr, ptr, i32, i32, i32, i16 } define dso_local noundef i32 @dsps_fir_f32_ansi(ptr nocapture noundef %fir, ptr nocapture noundef readonly %input, ptr nocapture noundef writeonly %output, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_fir_f32_ansi( -; CHECK-SAME: ptr nocapture noundef [[FIR:%.*]], ptr nocapture noundef readonly [[INPUT:%.*]], ptr nocapture noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef [[FIR:%.*]], ptr noalias nocapture noundef readonly [[INPUT:%.*]], ptr noalias nocapture noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER:%.*]], label [[FOR_BODY_LR_PH_CLONE:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_PREHEADER:%.*]], label [[FOR_BODY_LR_PH_CLONE_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[CMP67:%.*]] = icmp sgt i32 [[LEN]], 0 -; CHECK-NEXT: br i1 [[CMP67]], label [[FOR_BODY_LR_PH:%.*]], label [[IF_END:%.*]] -; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[DELAY:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S:%.*]], ptr [[FIR]], i32 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DELAY]], align 4 ; CHECK-NEXT: [[POS:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 3 ; CHECK-NEXT: [[N:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[N]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP2]], -7 ; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[POS]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: if.end: -; CHECK-NEXT: ret i32 0 ; CHECK: for.body: -; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_BODY_LR_PH]] ], [ [[SPEC_STORE_SELECT:%.*]], [[FOR_COND_CLEANUP21:%.*]] ] -; CHECK-NEXT: [[I_068:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC33:%.*]], [[FOR_COND_CLEANUP21]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_COND_PREHEADER]] ], [ [[SPEC_STORE_SELECT:%.*]], [[FOR_END:%.*]] ] +; CHECK-NEXT: [[I_068:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER]] ], [ [[INC33_MODIFY:%.*]], [[FOR_END]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_068]] ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[TMP3]] @@ -31,119 +27,286 @@ define dso_local noundef i32 @dsps_fir_f32_ansi(ptr nocapture noundef %fir, ptr ; CHECK-NEXT: [[CMP4_NOT:%.*]] = icmp slt i32 [[INC]], [[TMP2]] ; CHECK-NEXT: [[SPEC_STORE_SELECT]] = select i1 [[CMP4_NOT]], i32 [[INC]], i32 0 ; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT]], ptr [[POS]], align 4 -; CHECK-NEXT: [[CMP957:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT]], [[TMP2]] -; CHECK-NEXT: br i1 [[CMP957]], label [[FOR_BODY11_LR_PH:%.*]], 
label [[FOR_COND18_PREHEADER:%.*]] +; CHECK-NEXT: [[CMP957:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP957]], label [[FOR_BODY11_LR_PH_MODIFY:%.*]], label [[FOR_COND18_PREHEADER_MODIFY:%.*]] +; CHECK: for.cond18.preheader.modify: +; CHECK-NEXT: [[N_060_MODIFY_CLONE:%.*]] = phi i32 [ [[SPEC_STORE_SELECT]], [[FOR_BODY]] ], [ [[INC16_MODIFY:%.*]], [[FOR_BODY11_MODIFY:%.*]] ] +; CHECK-NEXT: [[COEFF_POS_059_MODIFY_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC12_MODIFY:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC_058_MODIFY_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC4_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP26:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC7_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP27:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC10_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP28:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC13_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP29:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC17_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP30:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC20_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP31:%.*]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[N_060_MODIFY_CLONE]], [[TMP2]] +; CHECK-NEXT: br i1 [[CMP_SLT]], label [[FOR_BODY11_LR_PH:%.*]], label [[FOR_COND18_PREHEADER:%.*]] ; CHECK: for.body11.lr.ph: ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[FIR]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP2]], [[SPEC_STORE_SELECT]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[COEFF_POS_059_MODIFY_CLONE]] ; CHECK-NEXT: br label [[FOR_BODY11:%.*]] +; CHECK: for.body11.lr.ph.modify: +; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: br label [[FOR_BODY11_MODIFY]] +; CHECK: for.body11.modify: +; CHECK-NEXT: [[N_060_MODIFY:%.*]] = phi i32 [ [[SPEC_STORE_SELECT]], [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[INC16_MODIFY]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[COEFF_POS_059_MODIFY:%.*]] = phi i32 [ 0, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[INC12_MODIFY]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC_058_MODIFY:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP24]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP25]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP26]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP27]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC10:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP28]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC13:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP29]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP30]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[ACC20:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_MODIFY]] ], [ [[TMP31]], [[FOR_BODY11_MODIFY]] ] +; CHECK-NEXT: [[INC12_MODIFY]] = add nuw nsw i32 [[COEFF_POS_059_MODIFY]], 8 +; CHECK-NEXT: [[INC16_MODIFY]] = add nsw i32 
[[N_060_MODIFY]], 8 +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[COEFF_POS_059_MODIFY]], 7 +; CHECK-NEXT: [[ARRAYIDX13_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[COEFF_POS_059_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX15_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N_060_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 3 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 3 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 5 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 5 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX13_MODIFY]], i32 6 +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 6 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[ADD7]] +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX15_MODIFY]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX13_MODIFY]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX15_MODIFY]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC_058_MODIFY]]) +; CHECK-NEXT: [[TMP25]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[ACC]]) +; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[ACC4]]) +; CHECK-NEXT: [[TMP27]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[ACC7]]) +; CHECK-NEXT: [[TMP28]] = tail call float @llvm.fmuladd.f32(float [[TMP16]], float [[TMP17]], float [[ACC10]]) +; CHECK-NEXT: [[TMP29]] = tail call float @llvm.fmuladd.f32(float [[TMP18]], float [[TMP19]], float [[ACC13]]) +; CHECK-NEXT: [[TMP30]] = tail call float 
@llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC17]]) +; CHECK-NEXT: [[TMP31]] = tail call float @llvm.fmuladd.f32(float [[TMP22]], float [[TMP23]], float [[ACC20]]) +; CHECK-NEXT: [[CMP11:%.*]] = icmp slt i32 [[INC16_MODIFY]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY11_MODIFY]], label [[FOR_COND18_PREHEADER_MODIFY]] +; CHECK: for.cond18.preheader.loopexit: +; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP6]], [[N_060_MODIFY_CLONE]] +; CHECK-NEXT: br label [[FOR_COND18_PREHEADER]] ; CHECK: for.cond18.preheader: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY]] ], [ [[TMP10:%.*]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP6]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[CMP2062:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT]], 0 -; CHECK-NEXT: br i1 [[CMP2062]], label [[FOR_BODY22_LR_PH:%.*]], label [[FOR_COND_CLEANUP21]] +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ [[ACC_058_MODIFY_CLONE]], [[FOR_COND18_PREHEADER_MODIFY]] ], [ [[TMP37:%.*]], [[FOR_COND18_PREHEADER_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ [[COEFF_POS_059_MODIFY_CLONE]], [[FOR_COND18_PREHEADER_MODIFY]] ], [ [[TMP32]], [[FOR_COND18_PREHEADER_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add nsw i32 [[SPEC_STORE_SELECT]], -7 +; CHECK-NEXT: [[CMP2062:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT]], 7 +; CHECK-NEXT: br i1 [[CMP2062]], label [[FOR_BODY22_LR_PH_MODIFY:%.*]], label [[FOR_COND_CLEANUP21:%.*]] ; CHECK: for.body22.lr.ph: -; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: br label [[FOR_BODY22:%.*]] ; CHECK: for.body11: -; CHECK-NEXT: [[N_060:%.*]] = phi i32 [ [[SPEC_STORE_SELECT]], [[FOR_BODY11_LR_PH]] ], [ [[INC16:%.*]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[COEFF_POS_059:%.*]] = phi i32 [ 0, [[FOR_BODY11_LR_PH]] ], [ [[INC12:%.*]], [[FOR_BODY11]] ] -; CHECK-NEXT: [[ACC_058:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH]] ], [ [[TMP10]], [[FOR_BODY11]] ] +; CHECK-NEXT: [[N_060:%.*]] = phi i32 [ [[N_060_MODIFY_CLONE]], [[FOR_BODY11_LR_PH]] ], [ [[INC16:%.*]], [[FOR_BODY11]] ] +; CHECK-NEXT: [[COEFF_POS_059:%.*]] = phi i32 [ [[COEFF_POS_059_MODIFY_CLONE]], [[FOR_BODY11_LR_PH]] ], [ [[INC12:%.*]], [[FOR_BODY11]] ] +; CHECK-NEXT: [[ACC_058:%.*]] = phi float [ [[ACC_058_MODIFY_CLONE]], [[FOR_BODY11_LR_PH]] ], [ [[TMP37]], [[FOR_BODY11]] ] ; CHECK-NEXT: [[INC12]] = add nuw i32 [[COEFF_POS_059]], 1 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 [[COEFF_POS_059]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N_060]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[ACC_058]]) +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP37]] = tail call float @llvm.fmuladd.f32(float [[TMP35]], float [[TMP36]], float [[ACC_058]]) ; CHECK-NEXT: [[INC16]] = add nsw i32 [[N_060]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC12]], [[TMP6]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND18_PREHEADER]], label [[FOR_BODY11]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC16]], [[TMP2]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_COND18_PREHEADER_LOOPEXIT]], label [[FOR_BODY11]] ; CHECK: for.cond.cleanup21: -; CHECK-NEXT: [[ACC_1_LCSSA:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND18_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY22]] ] -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_068]] -; CHECK-NEXT: store float [[ACC_1_LCSSA]], ptr [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[INC33]] = add nuw nsw i32 [[I_068]], 1 -; CHECK-NEXT: [[EXITCOND71_NOT:%.*]] = icmp eq i32 [[INC33]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND71_NOT]], label [[IF_END]], label [[FOR_BODY]] +; CHECK-NEXT: [[N17_065_MODIFY_CLONE:%.*]] = phi i32 [ 0, [[FOR_COND18_PREHEADER]] ], [ [[TMP39:%.*]], [[FOR_BODY22_MODIFY:%.*]] ] +; CHECK-NEXT: [[COEFF_POS_164_MODIFY_CLONE:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_COND18_PREHEADER]] ], [ [[INC24_MODIFY:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC_163_MODIFY_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND18_PREHEADER]] ], [ [[TMP56:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC23_CLONE:%.*]] = phi float [ [[ACC_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP57:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC27_CLONE:%.*]] = phi float [ [[ACC4_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP58:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC30_CLONE:%.*]] = phi float [ [[ACC7_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP59:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC34_CLONE:%.*]] = phi float [ [[ACC10_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP60:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC37_CLONE:%.*]] = phi float [ [[ACC13_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP61:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC40_CLONE:%.*]] = phi float [ [[ACC17_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP62:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC44_CLONE:%.*]] = phi float [ [[ACC20_CLONE]], [[FOR_COND18_PREHEADER]] ], [ [[TMP63:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[CMP47:%.*]] = icmp slt i32 [[N17_065_MODIFY_CLONE]], [[SPEC_STORE_SELECT]] +; CHECK-NEXT: br i1 [[CMP47]], label [[FOR_BODY22_LR_PH:%.*]], label [[FOR_END]] +; CHECK: for.body22.lr.ph.modify: +; CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP39]] = and i32 [[SPEC_STORE_SELECT]], 2147483640 +; CHECK-NEXT: br label [[FOR_BODY22_MODIFY]] +; CHECK: for.body22.modify: +; CHECK-NEXT: [[N17_065_MODIFY:%.*]] = phi i32 [ 0, [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[INC29_MODIFY:%.*]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[COEFF_POS_164_MODIFY:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[INC24_MODIFY]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC_163_MODIFY:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP56]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC23:%.*]] = phi float [ [[ACC_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP57]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC27:%.*]] = phi float [ [[ACC4_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP58]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC30:%.*]] = phi float [ [[ACC7_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP59]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC34:%.*]] = phi float [ [[ACC10_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP60]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC37:%.*]] = phi float [ [[ACC13_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP61]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC40:%.*]] = phi float [ [[ACC17_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP62]], 
[[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[ACC44:%.*]] = phi float [ [[ACC20_CLONE]], [[FOR_BODY22_LR_PH_MODIFY]] ], [ [[TMP63]], [[FOR_BODY22_MODIFY]] ] +; CHECK-NEXT: [[INC24_MODIFY]] = add nuw nsw i32 [[COEFF_POS_164_MODIFY]], 8 +; CHECK-NEXT: [[INC29_MODIFY]] = add nuw nsw i32 [[N17_065_MODIFY]], 8 +; CHECK-NEXT: [[ADD1:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 1 +; CHECK-NEXT: [[ADD2:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 2 +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 3 +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 4 +; CHECK-NEXT: [[ADD5:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 5 +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 6 +; CHECK-NEXT: [[ADD743:%.*]] = or disjoint i32 [[N17_065_MODIFY]], 7 +; CHECK-NEXT: [[ARRAYIDX25_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP38]], i32 [[COEFF_POS_164_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX27_MODIFY:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N17_065_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 1 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD1]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 2 +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD2]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 3 +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 4 +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 5 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD5]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 6 +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX25_MODIFY]], i32 7 +; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[ADD743]] +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[ARRAYIDX25_MODIFY]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[ARRAYIDX27_MODIFY]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX46]], align 4 +; CHECK-NEXT: [[TMP56]] = tail call float 
@llvm.fmuladd.f32(float [[TMP40]], float [[TMP41]], float [[ACC_163_MODIFY]]) +; CHECK-NEXT: [[TMP57]] = tail call float @llvm.fmuladd.f32(float [[TMP42]], float [[TMP43]], float [[ACC23]]) +; CHECK-NEXT: [[TMP58]] = tail call float @llvm.fmuladd.f32(float [[TMP44]], float [[TMP45]], float [[ACC27]]) +; CHECK-NEXT: [[TMP59]] = tail call float @llvm.fmuladd.f32(float [[TMP46]], float [[TMP47]], float [[ACC30]]) +; CHECK-NEXT: [[TMP60]] = tail call float @llvm.fmuladd.f32(float [[TMP48]], float [[TMP49]], float [[ACC34]]) +; CHECK-NEXT: [[TMP61]] = tail call float @llvm.fmuladd.f32(float [[TMP50]], float [[TMP51]], float [[ACC37]]) +; CHECK-NEXT: [[TMP62]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP53]], float [[ACC40]]) +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP54]], float [[TMP55]], float [[ACC44]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC29_MODIFY]], [[TMP33]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY22_MODIFY]], label [[FOR_COND_CLEANUP21]] ; CHECK: for.body22: -; CHECK-NEXT: [[N17_065:%.*]] = phi i32 [ 0, [[FOR_BODY22_LR_PH]] ], [ [[INC29:%.*]], [[FOR_BODY22]] ] -; CHECK-NEXT: [[COEFF_POS_164:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_BODY22_LR_PH]] ], [ [[INC24:%.*]], [[FOR_BODY22]] ] -; CHECK-NEXT: [[ACC_163:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY22_LR_PH]] ], [ [[TMP13]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[N17_065:%.*]] = phi i32 [ [[N17_065_MODIFY_CLONE]], [[FOR_BODY22_LR_PH]] ], [ [[INC29:%.*]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[COEFF_POS_164:%.*]] = phi i32 [ [[COEFF_POS_164_MODIFY_CLONE]], [[FOR_BODY22_LR_PH]] ], [ [[INC24:%.*]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[ACC_163:%.*]] = phi float [ [[ACC_163_MODIFY_CLONE]], [[FOR_BODY22_LR_PH]] ], [ [[TMP66:%.*]], [[FOR_BODY22]] ] ; CHECK-NEXT: [[INC24]] = add nuw nsw i32 [[COEFF_POS_164]], 1 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[COEFF_POS_164]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 [[COEFF_POS_164]] +; CHECK-NEXT: [[TMP64:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[N17_065]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[TMP13]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[ACC_163]]) +; CHECK-NEXT: [[TMP65:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[ACC_163]]) ; CHECK-NEXT: [[INC29]] = add nuw nsw i32 [[N17_065]], 1 ; CHECK-NEXT: [[EXITCOND70_NOT:%.*]] = icmp eq i32 [[INC29]], [[SPEC_STORE_SELECT]] -; CHECK-NEXT: br i1 [[EXITCOND70_NOT]], label [[FOR_COND_CLEANUP21]], label [[FOR_BODY22]] +; CHECK-NEXT: br i1 [[EXITCOND70_NOT]], label [[FOR_END]], label [[FOR_BODY22]] +; CHECK: for.end: +; CHECK-NEXT: [[TMP67:%.*]] = phi float [ [[ACC_163_MODIFY_CLONE]], [[FOR_COND_CLEANUP21]] ], [ [[TMP66]], [[FOR_BODY22]] ] +; CHECK-NEXT: [[ADD139:%.*]] = fadd float [[TMP67]], [[ACC23_CLONE]] +; CHECK-NEXT: [[ADD140:%.*]] = fadd float [[ACC27_CLONE]], [[ACC30_CLONE]] +; CHECK-NEXT: [[ADD141:%.*]] = fadd float [[ACC34_CLONE]], [[ACC37_CLONE]] +; CHECK-NEXT: [[ADD142:%.*]] = fadd float [[ACC40_CLONE]], [[ACC44_CLONE]] +; CHECK-NEXT: [[ADD143:%.*]] = fadd float [[ADD139]], [[ADD140]] +; CHECK-NEXT: [[ADD144:%.*]] = fadd float [[ADD141]], [[ADD142]] +; 
CHECK-NEXT: [[ADD145:%.*]] = fadd float [[ADD143]], [[ADD144]] +; CHECK-NEXT: [[ARRAYIDX31_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_068]] +; CHECK-NEXT: store float [[ADD145]], ptr [[ARRAYIDX31_MODIFY]], align 4 +; CHECK-NEXT: [[INC33_MODIFY]] = add nuw nsw i32 [[I_068]], 1 +; CHECK-NEXT: [[EXITCOND71_NOT_MODIFY:%.*]] = icmp eq i32 [[INC33_MODIFY]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND71_NOT_MODIFY]], label [[IF_END:%.*]], label [[FOR_BODY]] +; CHECK: for.body.lr.ph.clone.preheader: +; CHECK-NEXT: [[CMP151349:%.*]] = icmp sgt i32 [[LEN]], 0 +; CHECK-NEXT: br i1 [[CMP151349]], label [[FOR_BODY_LR_PH_CLONE:%.*]], label [[IF_END]] ; CHECK: for.body.lr.ph.clone: ; CHECK-NEXT: [[DELAY_CLONE:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DELAY_CLONE]], align 4 +; CHECK-NEXT: [[TMP68:%.*]] = load ptr, ptr [[DELAY_CLONE]], align 4 ; CHECK-NEXT: [[POS_CLONE:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 3 ; CHECK-NEXT: [[N_CLONE:%.*]] = getelementptr inbounds [[STRUCT_FIR_F32_S]], ptr [[FIR]], i32 0, i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_CLONE]], align 4 +; CHECK-NEXT: [[TMP69:%.*]] = load i32, ptr [[N_CLONE]], align 4 ; CHECK-NEXT: [[DOTPRE_CLONE:%.*]] = load i32, ptr [[POS_CLONE]], align 4 ; CHECK-NEXT: br label [[FOR_BODY_CLONE:%.*]] ; CHECK: for.body.clone: -; CHECK-NEXT: [[TMP16:%.*]] = phi i32 [ [[DOTPRE_CLONE]], [[FOR_BODY_LR_PH_CLONE]] ], [ [[SPEC_STORE_SELECT_CLONE:%.*]], [[FOR_COND_CLEANUP21_CLONE:%.*]] ] +; CHECK-NEXT: [[TMP70:%.*]] = phi i32 [ [[DOTPRE_CLONE]], [[FOR_BODY_LR_PH_CLONE]] ], [ [[SPEC_STORE_SELECT_CLONE:%.*]], [[FOR_COND_CLEANUP21_CLONE:%.*]] ] ; CHECK-NEXT: [[I_068_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_CLONE]] ], [ [[INC33_CLONE:%.*]], [[FOR_COND_CLEANUP21_CLONE]] ] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_068_CLONE]] -; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[ARRAYIDX1_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP16]] -; CHECK-NEXT: store float [[TMP17]], ptr [[ARRAYIDX1_CLONE]], align 4 -; CHECK-NEXT: [[INC_CLONE:%.*]] = add nsw i32 [[TMP16]], 1 -; CHECK-NEXT: [[CMP4_NOT_CLONE:%.*]] = icmp slt i32 [[INC_CLONE]], [[TMP15]] +; CHECK-NEXT: [[TMP71:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[ARRAYIDX1_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 [[TMP70]] +; CHECK-NEXT: store float [[TMP71]], ptr [[ARRAYIDX1_CLONE]], align 4 +; CHECK-NEXT: [[INC_CLONE:%.*]] = add nsw i32 [[TMP70]], 1 +; CHECK-NEXT: [[CMP4_NOT_CLONE:%.*]] = icmp slt i32 [[INC_CLONE]], [[TMP69]] ; CHECK-NEXT: [[SPEC_STORE_SELECT_CLONE]] = select i1 [[CMP4_NOT_CLONE]], i32 [[INC_CLONE]], i32 0 ; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT_CLONE]], ptr [[POS_CLONE]], align 4 -; CHECK-NEXT: [[CMP957_CLONE:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT_CLONE]], [[TMP15]] +; CHECK-NEXT: [[CMP957_CLONE:%.*]] = icmp slt i32 [[SPEC_STORE_SELECT_CLONE]], [[TMP69]] ; CHECK-NEXT: br i1 [[CMP957_CLONE]], label [[FOR_BODY11_LR_PH_CLONE:%.*]], label [[FOR_COND18_PREHEADER_CLONE:%.*]] ; CHECK: for.body11.lr.ph.clone: -; CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[FIR]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP15]], [[SPEC_STORE_SELECT_CLONE]] +; CHECK-NEXT: [[TMP72:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = sub i32 [[TMP69]], [[SPEC_STORE_SELECT_CLONE]] ; CHECK-NEXT: br 
label [[FOR_BODY11_CLONE:%.*]] ; CHECK: for.body11.clone: ; CHECK-NEXT: [[N_060_CLONE:%.*]] = phi i32 [ [[SPEC_STORE_SELECT_CLONE]], [[FOR_BODY11_LR_PH_CLONE]] ], [ [[INC16_CLONE:%.*]], [[FOR_BODY11_CLONE]] ] ; CHECK-NEXT: [[COEFF_POS_059_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY11_LR_PH_CLONE]] ], [ [[INC12_CLONE:%.*]], [[FOR_BODY11_CLONE]] ] -; CHECK-NEXT: [[ACC_058_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_CLONE]] ], [ [[TMP22:%.*]], [[FOR_BODY11_CLONE]] ] +; CHECK-NEXT: [[ACC_058_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY11_LR_PH_CLONE]] ], [ [[TMP76:%.*]], [[FOR_BODY11_CLONE]] ] ; CHECK-NEXT: [[INC12_CLONE]] = add nuw i32 [[COEFF_POS_059_CLONE]], 1 -; CHECK-NEXT: [[ARRAYIDX13_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 [[COEFF_POS_059_CLONE]] -; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX13_CLONE]], align 4 -; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[N_060_CLONE]] -; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 -; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP20]], float [[TMP21]], float [[ACC_058_CLONE]]) +; CHECK-NEXT: [[ARRAYIDX13_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP72]], i32 [[COEFF_POS_059_CLONE]] +; CHECK-NEXT: [[TMP74:%.*]] = load float, ptr [[ARRAYIDX13_CLONE]], align 4 +; CHECK-NEXT: [[ARRAYIDX15_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 [[N_060_CLONE]] +; CHECK-NEXT: [[TMP75:%.*]] = load float, ptr [[ARRAYIDX15_CLONE]], align 4 +; CHECK-NEXT: [[TMP76]] = tail call float @llvm.fmuladd.f32(float [[TMP74]], float [[TMP75]], float [[ACC_058_CLONE]]) ; CHECK-NEXT: [[INC16_CLONE]] = add nsw i32 [[N_060_CLONE]], 1 -; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC12_CLONE]], [[TMP19]] +; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC12_CLONE]], [[TMP73]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[FOR_COND18_PREHEADER_CLONE]], label [[FOR_BODY11_CLONE]] ; CHECK: for.cond18.preheader.clone: -; CHECK-NEXT: [[ACC_0_LCSSA_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_CLONE]] ], [ [[TMP22]], [[FOR_BODY11_CLONE]] ] -; CHECK-NEXT: [[COEFF_POS_0_LCSSA_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY_CLONE]] ], [ [[TMP19]], [[FOR_BODY11_CLONE]] ] +; CHECK-NEXT: [[ACC_0_LCSSA_CLONE:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_CLONE]] ], [ [[TMP76]], [[FOR_BODY11_CLONE]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY_CLONE]] ], [ [[TMP73]], [[FOR_BODY11_CLONE]] ] ; CHECK-NEXT: [[CMP2062_CLONE:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT_CLONE]], 0 ; CHECK-NEXT: br i1 [[CMP2062_CLONE]], label [[FOR_BODY22_LR_PH_CLONE:%.*]], label [[FOR_COND_CLEANUP21_CLONE]] ; CHECK: for.body22.lr.ph.clone: -; CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP77:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: br label [[FOR_BODY22_CLONE:%.*]] ; CHECK: for.body22.clone: ; CHECK-NEXT: [[N17_065_CLONE:%.*]] = phi i32 [ 0, [[FOR_BODY22_LR_PH_CLONE]] ], [ [[INC29_CLONE:%.*]], [[FOR_BODY22_CLONE]] ] ; CHECK-NEXT: [[COEFF_POS_164_CLONE:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA_CLONE]], [[FOR_BODY22_LR_PH_CLONE]] ], [ [[INC24_CLONE:%.*]], [[FOR_BODY22_CLONE]] ] -; CHECK-NEXT: [[ACC_163_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_BODY22_LR_PH_CLONE]] ], [ [[TMP26:%.*]], [[FOR_BODY22_CLONE]] ] +; CHECK-NEXT: [[ACC_163_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_BODY22_LR_PH_CLONE]] ], [ [[TMP80:%.*]], [[FOR_BODY22_CLONE]] ] ; 
CHECK-NEXT: [[INC24_CLONE]] = add nuw nsw i32 [[COEFF_POS_164_CLONE]], 1 -; CHECK-NEXT: [[ARRAYIDX25_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[COEFF_POS_164_CLONE]] -; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX25_CLONE]], align 4 -; CHECK-NEXT: [[ARRAYIDX27_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[N17_065_CLONE]] -; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX27_CLONE]], align 4 -; CHECK-NEXT: [[TMP26]] = tail call float @llvm.fmuladd.f32(float [[TMP24]], float [[TMP25]], float [[ACC_163_CLONE]]) +; CHECK-NEXT: [[ARRAYIDX25_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 [[COEFF_POS_164_CLONE]] +; CHECK-NEXT: [[TMP78:%.*]] = load float, ptr [[ARRAYIDX25_CLONE]], align 4 +; CHECK-NEXT: [[ARRAYIDX27_CLONE:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 [[N17_065_CLONE]] +; CHECK-NEXT: [[TMP79:%.*]] = load float, ptr [[ARRAYIDX27_CLONE]], align 4 +; CHECK-NEXT: [[TMP80]] = tail call float @llvm.fmuladd.f32(float [[TMP78]], float [[TMP79]], float [[ACC_163_CLONE]]) ; CHECK-NEXT: [[INC29_CLONE]] = add nuw nsw i32 [[N17_065_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND70_NOT_CLONE:%.*]] = icmp eq i32 [[INC29_CLONE]], [[SPEC_STORE_SELECT_CLONE]] ; CHECK-NEXT: br i1 [[EXITCOND70_NOT_CLONE]], label [[FOR_COND_CLEANUP21_CLONE]], label [[FOR_BODY22_CLONE]] ; CHECK: for.cond.cleanup21.clone: -; CHECK-NEXT: [[ACC_1_LCSSA_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_COND18_PREHEADER_CLONE]] ], [ [[TMP26]], [[FOR_BODY22_CLONE]] ] +; CHECK-NEXT: [[ACC_1_LCSSA_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_COND18_PREHEADER_CLONE]] ], [ [[TMP80]], [[FOR_BODY22_CLONE]] ] ; CHECK-NEXT: [[ARRAYIDX31_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_068_CLONE]] ; CHECK-NEXT: store float [[ACC_1_LCSSA_CLONE]], ptr [[ARRAYIDX31_CLONE]], align 4 ; CHECK-NEXT: [[INC33_CLONE]] = add nuw nsw i32 [[I_068_CLONE]], 1 ; CHECK-NEXT: [[EXITCOND71_NOT_CLONE:%.*]] = icmp eq i32 [[INC33_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND71_NOT_CLONE]], label [[IF_END]], label [[FOR_BODY_CLONE]] +; CHECK: if.end: +; CHECK-NEXT: ret i32 0 ; entry: %0 = icmp sgt i32 %len, 2 diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll index 875710cf61b86c..e7a15e8558512f 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/fird.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s %struct.fir_f32_s = type { ptr, ptr, i32, i32, i32, i16 } define dso_local noundef i32 @dsps_fird_f32_ansi(ptr nocapture noundef %fir, ptr nocapture noundef readonly %input, ptr nocapture noundef writeonly %output, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_fird_f32_ansi( -; CHECK-SAME: ptr nocapture noundef [[FIR:%.*]], ptr nocapture noundef readonly [[INPUT:%.*]], ptr nocapture noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef [[FIR:%.*]], ptr noalias nocapture noundef readonly [[INPUT:%.*]], ptr noalias nocapture noundef 
writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP77:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP77]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -18,32 +18,57 @@ define dso_local noundef i32 @dsps_fird_f32_ansi(ptr nocapture noundef %fir, ptr ; CHECK-NEXT: [[POS9_PROMOTED:%.*]] = load i32, ptr [[POS]], align 4 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; CHECK: for.cond1.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[POS9_PROMOTED]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[TMP4:%.*]], [[FOR_COND_CLEANUP26:%.*]] ] -; CHECK-NEXT: [[I_080:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC39:%.*]], [[FOR_COND_CLEANUP26]] ] -; CHECK-NEXT: [[INPUT_ADDR_078:%.*]] = phi ptr [ [[INPUT]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INPUT_ADDR_1_LCSSA:%.*]], [[FOR_COND_CLEANUP26]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[POS9_PROMOTED]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[TMP4:%.*]], [[FOR_END141:%.*]] ] +; CHECK-NEXT: [[I_080:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC152:%.*]], [[FOR_END141]] ] +; CHECK-NEXT: [[INPUT_ADDR_078:%.*]] = phi ptr [ [[INPUT]], [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INPUT_ADDR_1_LCSSA:%.*]], [[FOR_END141]] ] ; CHECK-NEXT: br i1 [[CMP263]], label [[FOR_BODY4_LR_PH:%.*]], label [[FOR_COND_CLEANUP3:%.*]] ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DELAY]], align 4 ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[LEN]], [[FOR_COND_CLEANUP26]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[LEN]], [[FOR_END141]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge: -; CHECK-NEXT: store i32 [[SPEC_SELECT:%.*]], ptr [[POS]], align 4 +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: store i32 [[SPEC_SELECT_LCSSA]], ptr [[POS]], align 4 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] ; CHECK: for.cond.cleanup3: -; CHECK-NEXT: [[TMP4]] = phi i32 [ [[SPEC_SELECT]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE:%.*]] ], [ [[TMP2]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[INPUT_ADDR_1_LCSSA]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE]] ], [ [[INPUT_ADDR_078]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[CMP1266:%.*]] = icmp slt i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP1266]], label [[FOR_BODY14_LR_PH:%.*]], label [[FOR_COND23_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[SPEC_SELECT_LCSSA]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE:%.*]] ], [ [[TMP2]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[INPUT_ADDR_1_LCSSA]] = phi ptr [ [[INCDEC_PTR_LCSSA]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE]] ], [ [[INPUT_ADDR_078]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[ADD269:%.*]] = add nsw i32 [[TMP4]], 8 +; CHECK-NEXT: [[CMP1266:%.*]] = icmp sgt i32 [[ADD269]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP1266]], label [[FOR_COND63_PREHEADER:%.*]], label [[FOR_BODY14_LR_PH:%.*]] ; CHECK: for.body14.lr.ph: ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DELAY]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP1]], [[TMP4]] -; CHECK-NEXT: br label [[FOR_BODY14:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP6]], i32 28 +; 
CHECK-NEXT: [[TMP7:%.*]] = shl i32 [[TMP4]], 2 +; CHECK-NEXT: [[SCEVGEP101:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP105:%.*]] = getelementptr i8, ptr [[TMP6]], i32 24 +; CHECK-NEXT: [[SCEVGEP106:%.*]] = getelementptr i8, ptr [[SCEVGEP105]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP108:%.*]] = getelementptr i8, ptr [[TMP6]], i32 20 +; CHECK-NEXT: [[SCEVGEP109:%.*]] = getelementptr i8, ptr [[SCEVGEP108]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP111:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[SCEVGEP112:%.*]] = getelementptr i8, ptr [[SCEVGEP111]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP114:%.*]] = getelementptr i8, ptr [[TMP6]], i32 12 +; CHECK-NEXT: [[SCEVGEP115:%.*]] = getelementptr i8, ptr [[SCEVGEP114]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP117:%.*]] = getelementptr i8, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[SCEVGEP118:%.*]] = getelementptr i8, ptr [[SCEVGEP117]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP120:%.*]] = getelementptr i8, ptr [[TMP6]], i32 4 +; CHECK-NEXT: [[SCEVGEP121:%.*]] = getelementptr i8, ptr [[SCEVGEP120]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP123:%.*]] = getelementptr i8, ptr [[TMP6]], i32 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP127:%.*]] = getelementptr i8, ptr [[TMP5]], i32 28 +; CHECK-NEXT: [[SCEVGEP129:%.*]] = getelementptr i8, ptr [[TMP5]], i32 24 +; CHECK-NEXT: [[SCEVGEP131:%.*]] = getelementptr i8, ptr [[TMP5]], i32 20 +; CHECK-NEXT: [[SCEVGEP133:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16 +; CHECK-NEXT: [[SCEVGEP135:%.*]] = getelementptr i8, ptr [[TMP5]], i32 12 +; CHECK-NEXT: [[SCEVGEP137:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; CHECK-NEXT: [[SCEVGEP139:%.*]] = getelementptr i8, ptr [[TMP5]], i32 4 +; CHECK-NEXT: br label [[FOR_BODY14_7:%.*]] ; CHECK: for.body4: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY4]] ], [ [[TMP0]], [[FOR_BODY4_LR_PH]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY4_LR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY4]] ] -; CHECK-NEXT: [[K_065:%.*]] = phi i32 [ 0, [[FOR_BODY4_LR_PH]] ], [ [[INC8:%.*]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[INPUT_ADDR_164:%.*]] = phi ptr [ [[INPUT_ADDR_078]], [[FOR_BODY4_LR_PH]] ], [ [[INCDEC_PTR]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[INPUT_ADDR_164]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[INPUT_ADDR_164]], align 4 @@ -52,51 +77,247 @@ define dso_local noundef i32 @dsps_fird_f32_ansi(ptr nocapture noundef %fir, ptr ; CHECK-NEXT: store float [[TMP9]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp slt i32 [[INC]], [[TMP1]] ; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP6_NOT]], i32 [[INC]], i32 0 -; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[K_065]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC8]], [[TMP0]] +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE]], label [[FOR_BODY4]] -; CHECK: for.cond23.preheader: -; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP14:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP3]] ], [ [[TMP7]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[CMP2572:%.*]] = icmp sgt i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[CMP2572]], label [[FOR_BODY27_LR_PH:%.*]], label [[FOR_COND_CLEANUP26]] +; CHECK: for.cond63.preheader: +; CHECK-NEXT: [[ACC_0_LCSSA:%.*]] = phi float [ 0.000000e+00, 
[[FOR_COND_CLEANUP3]] ], [ [[TMP18:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_1_LCSSA2:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP21:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_2_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP24:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_3_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP27:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_4_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP30:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_5_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP33:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_6_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP36:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC_7_LCSSA:%.*]] = phi float [ 0.000000e+00, [[FOR_COND_CLEANUP3]] ], [ [[TMP39:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP3]] ], [ [[LSR_IV_NEXT126:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[FOR_COND_CLEANUP3]] ], [ [[LSR_IV_NEXT100:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[CMP2572:%.*]] = icmp slt i32 [[N_0_LCSSA]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP2572]], label [[FOR_BODY27_LR_PH:%.*]], label [[FOR_COND_CLEANUP26:%.*]] ; CHECK: for.body27.lr.ph: ; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[FIR]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DELAY]], align 4 -; CHECK-NEXT: br label [[FOR_BODY27:%.*]] -; CHECK: for.body14: -; CHECK-NEXT: [[N_069:%.*]] = phi i32 [ [[TMP4]], [[FOR_BODY14_LR_PH]] ], [ [[INC20:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[COEFF_POS_068:%.*]] = phi i32 [ 0, [[FOR_BODY14_LR_PH]] ], [ [[INC15:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[ACC_067:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP14]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[INC15]] = add nuw i32 [[COEFF_POS_068]], 1 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 [[COEFF_POS_068]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 [[N_069]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 -; CHECK-NEXT: [[TMP14]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[ACC_067]]) -; CHECK-NEXT: [[INC20]] = add nsw i32 [[N_069]], 1 -; CHECK-NEXT: [[EXITCOND83_NOT:%.*]] = icmp eq i32 [[INC15]], [[TMP7]] -; CHECK-NEXT: br i1 [[EXITCOND83_NOT]], label [[FOR_COND23_PREHEADER]], label [[FOR_BODY14]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[COEFF_POS_0_LCSSA]], [[TMP1]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i32 [[TMP1]], [[N_0_LCSSA]] +; CHECK-NEXT: [[TMP14:%.*]] = shl i32 [[N_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP144:%.*]] = getelementptr i8, ptr [[TMP11]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = shl i32 [[COEFF_POS_0_LCSSA]], 2 +; CHECK-NEXT: [[SCEVGEP147:%.*]] = getelementptr i8, ptr [[TMP10]], i32 [[TMP15]] +; CHECK-NEXT: br label [[FOR_BODY14_CLONE:%.*]] +; CHECK: for.body14.7: +; CHECK-NEXT: [[LSR_IV125:%.*]] = phi i32 [ 0, [[FOR_BODY14_LR_PH]] ], [ [[LSR_IV_NEXT126]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[LSR_IV102:%.*]] = phi i32 [ 0, [[FOR_BODY14_LR_PH]] ], [ [[LSR_IV_NEXT103:%.*]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[LSR_IV99:%.*]] = phi i32 [ [[TMP4]], [[FOR_BODY14_LR_PH]] ], [ [[LSR_IV_NEXT100]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ 
[[TMP18]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC3:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP21]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC4:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP24]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC5:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP27]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP30]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP33]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC8:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP36]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[ACC9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY14_LR_PH]] ], [ [[TMP39]], [[FOR_BODY14_7]] ] +; CHECK-NEXT: [[SCEVGEP141:%.*]] = getelementptr i8, ptr [[TMP5]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[SCEVGEP141]], align 4 +; CHECK-NEXT: [[SCEVGEP124:%.*]] = getelementptr i8, ptr [[SCEVGEP123]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCEVGEP124]], align 4 +; CHECK-NEXT: [[TMP18]] = tail call float @llvm.fmuladd.f32(float [[TMP16]], float [[TMP17]], float [[ACC]]) +; CHECK-NEXT: [[SCEVGEP140:%.*]] = getelementptr i8, ptr [[SCEVGEP139]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[SCEVGEP140]], align 4 +; CHECK-NEXT: [[SCEVGEP122:%.*]] = getelementptr i8, ptr [[SCEVGEP121]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[SCEVGEP122]], align 4 +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP19]], float [[TMP20]], float [[ACC3]]) +; CHECK-NEXT: [[SCEVGEP138:%.*]] = getelementptr i8, ptr [[SCEVGEP137]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[SCEVGEP138]], align 4 +; CHECK-NEXT: [[SCEVGEP119:%.*]] = getelementptr i8, ptr [[SCEVGEP118]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[SCEVGEP119]], align 4 +; CHECK-NEXT: [[TMP24]] = tail call float @llvm.fmuladd.f32(float [[TMP22]], float [[TMP23]], float [[ACC4]]) +; CHECK-NEXT: [[SCEVGEP136:%.*]] = getelementptr i8, ptr [[SCEVGEP135]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCEVGEP136]], align 4 +; CHECK-NEXT: [[SCEVGEP116:%.*]] = getelementptr i8, ptr [[SCEVGEP115]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[SCEVGEP116]], align 4 +; CHECK-NEXT: [[TMP27]] = tail call float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[ACC5]]) +; CHECK-NEXT: [[SCEVGEP134:%.*]] = getelementptr i8, ptr [[SCEVGEP133]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[SCEVGEP134]], align 4 +; CHECK-NEXT: [[SCEVGEP113:%.*]] = getelementptr i8, ptr [[SCEVGEP112]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[SCEVGEP113]], align 4 +; CHECK-NEXT: [[TMP30]] = tail call float @llvm.fmuladd.f32(float [[TMP28]], float [[TMP29]], float [[ACC6]]) +; CHECK-NEXT: [[SCEVGEP132:%.*]] = getelementptr i8, ptr [[SCEVGEP131]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[SCEVGEP132]], align 4 +; CHECK-NEXT: [[SCEVGEP110:%.*]] = getelementptr i8, ptr [[SCEVGEP109]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[SCEVGEP110]], align 4 +; CHECK-NEXT: [[TMP33]] = tail call float @llvm.fmuladd.f32(float [[TMP31]], float [[TMP32]], float [[ACC7]]) +; CHECK-NEXT: [[SCEVGEP130:%.*]] = getelementptr i8, ptr [[SCEVGEP129]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP34:%.*]] = 
load float, ptr [[SCEVGEP130]], align 4 +; CHECK-NEXT: [[SCEVGEP107:%.*]] = getelementptr i8, ptr [[SCEVGEP106]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[SCEVGEP107]], align 4 +; CHECK-NEXT: [[TMP36]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[ACC8]]) +; CHECK-NEXT: [[SCEVGEP128:%.*]] = getelementptr i8, ptr [[SCEVGEP127]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCEVGEP128]], align 4 +; CHECK-NEXT: [[SCEVGEP104:%.*]] = getelementptr i8, ptr [[SCEVGEP101]], i32 [[LSR_IV102]] +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[SCEVGEP104]], align 4 +; CHECK-NEXT: [[TMP39]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[ACC9]]) +; CHECK-NEXT: [[LSR_IV_NEXT100]] = add nsw i32 [[LSR_IV99]], 8 +; CHECK-NEXT: [[TMP40:%.*]] = add i32 [[LSR_IV_NEXT100]], 8 +; CHECK-NEXT: [[LSR_IV_NEXT103]] = add nuw i32 [[LSR_IV102]], 32 +; CHECK-NEXT: [[LSR_IV_NEXT126]] = add nuw i32 [[LSR_IV125]], 8 +; CHECK-NEXT: [[EXITCOND83_NOT_7:%.*]] = icmp sgt i32 [[TMP40]], [[TMP1]] +; CHECK-NEXT: br i1 [[EXITCOND83_NOT_7]], label [[FOR_COND63_PREHEADER]], label [[FOR_BODY14_7]] +; CHECK: for.body79.lr.ph: +; CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DELAY]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP4]], 2147483640 +; CHECK-NEXT: [[SCEVGEP150:%.*]] = getelementptr i8, ptr [[TMP42]], i32 28 +; CHECK-NEXT: [[SCEVGEP154:%.*]] = getelementptr i8, ptr [[TMP42]], i32 24 +; CHECK-NEXT: [[SCEVGEP156:%.*]] = getelementptr i8, ptr [[TMP42]], i32 20 +; CHECK-NEXT: [[SCEVGEP158:%.*]] = getelementptr i8, ptr [[TMP42]], i32 16 +; CHECK-NEXT: [[SCEVGEP160:%.*]] = getelementptr i8, ptr [[TMP42]], i32 12 +; CHECK-NEXT: [[SCEVGEP162:%.*]] = getelementptr i8, ptr [[TMP42]], i32 8 +; CHECK-NEXT: [[SCEVGEP164:%.*]] = getelementptr i8, ptr [[TMP42]], i32 4 +; CHECK-NEXT: [[SCEVGEP169:%.*]] = getelementptr i8, ptr [[TMP41]], i32 28 +; CHECK-NEXT: [[TMP44:%.*]] = shl i32 [[COEFF_POS_1_LCSSA:%.*]], 2 +; CHECK-NEXT: [[SCEVGEP170:%.*]] = getelementptr i8, ptr [[SCEVGEP169]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP172:%.*]] = getelementptr i8, ptr [[TMP41]], i32 24 +; CHECK-NEXT: [[SCEVGEP173:%.*]] = getelementptr i8, ptr [[SCEVGEP172]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP175:%.*]] = getelementptr i8, ptr [[TMP41]], i32 20 +; CHECK-NEXT: [[SCEVGEP176:%.*]] = getelementptr i8, ptr [[SCEVGEP175]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP178:%.*]] = getelementptr i8, ptr [[TMP41]], i32 16 +; CHECK-NEXT: [[SCEVGEP179:%.*]] = getelementptr i8, ptr [[SCEVGEP178]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP181:%.*]] = getelementptr i8, ptr [[TMP41]], i32 12 +; CHECK-NEXT: [[SCEVGEP182:%.*]] = getelementptr i8, ptr [[SCEVGEP181]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP184:%.*]] = getelementptr i8, ptr [[TMP41]], i32 8 +; CHECK-NEXT: [[SCEVGEP185:%.*]] = getelementptr i8, ptr [[SCEVGEP184]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP187:%.*]] = getelementptr i8, ptr [[TMP41]], i32 4 +; CHECK-NEXT: [[SCEVGEP188:%.*]] = getelementptr i8, ptr [[SCEVGEP187]], i32 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP190:%.*]] = getelementptr i8, ptr [[TMP41]], i32 [[TMP44]] +; CHECK-NEXT: br label [[FOR_BODY27_7:%.*]] +; CHECK: for.body14.clone: +; CHECK-NEXT: [[LSR_IV148:%.*]] = phi ptr [ [[SCEVGEP149:%.*]], [[FOR_BODY14_CLONE]] ], [ [[SCEVGEP147]], [[FOR_BODY27_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV145:%.*]] = phi ptr [ [[SCEVGEP146:%.*]], [[FOR_BODY14_CLONE]] ], [ [[SCEVGEP144]], [[FOR_BODY27_LR_PH]] ] 
+; CHECK-NEXT: [[LSR_IV142:%.*]] = phi i32 [ [[LSR_IV_NEXT143:%.*]], [[FOR_BODY14_CLONE]] ], [ [[TMP13]], [[FOR_BODY27_LR_PH]] ] +; CHECK-NEXT: [[ACC_067_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY27_LR_PH]] ], [ [[TMP47:%.*]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[LSR_IV148]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[LSR_IV145]], align 4 +; CHECK-NEXT: [[TMP47]] = tail call float @llvm.fmuladd.f32(float [[TMP45]], float [[TMP46]], float [[ACC_067_CLONE]]) +; CHECK-NEXT: [[LSR_IV_NEXT143]] = add i32 [[LSR_IV142]], -1 +; CHECK-NEXT: [[SCEVGEP146]] = getelementptr i8, ptr [[LSR_IV145]], i32 4 +; CHECK-NEXT: [[SCEVGEP149]] = getelementptr i8, ptr [[LSR_IV148]], i32 4 +; CHECK-NEXT: [[EXITCOND83_NOT_CLONE:%.*]] = icmp eq i32 [[LSR_IV_NEXT143]], 0 +; CHECK-NEXT: br i1 [[EXITCOND83_NOT_CLONE]], label [[FOR_COND_CLEANUP26_LOOPEXIT:%.*]], label [[FOR_BODY14_CLONE]] +; CHECK: for.cond130.preheader: +; CHECK-NEXT: [[ACC_0_LCSSA_CLONE:%.*]] = phi float [ [[ACC_1_LCSSA:%.*]], [[FOR_COND_CLEANUP26]] ], [ [[TMP51:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_1_LCSSA2_CLONE:%.*]] = phi float [ [[ACC_1_LCSSA2]], [[FOR_COND_CLEANUP26]] ], [ [[TMP54:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_2_LCSSA_CLONE:%.*]] = phi float [ [[ACC_2_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP57:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_3_LCSSA_CLONE:%.*]] = phi float [ [[ACC_3_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP60:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_4_LCSSA_CLONE:%.*]] = phi float [ [[ACC_4_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP63:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_5_LCSSA_CLONE:%.*]] = phi float [ [[ACC_5_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP66:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_6_LCSSA_CLONE:%.*]] = phi float [ [[ACC_6_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP69:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC_7_LCSSA_CLONE:%.*]] = phi float [ [[ACC_7_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[TMP72:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[COEFF_POS_0_LCSSA_CLONE:%.*]] = phi i32 [ [[COEFF_POS_1_LCSSA]], [[FOR_COND_CLEANUP26]] ], [ [[LSR_IV_NEXT168:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[N_0_LCSSA_CLONE:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP26]] ], [ [[TMP43]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[CMP2572_CLONE:%.*]] = icmp slt i32 [[N_0_LCSSA_CLONE]], [[TMP4]] +; CHECK-NEXT: br i1 [[CMP2572_CLONE]], label [[FOR_BODY133_LR_PH:%.*]], label [[FOR_END141]] +; CHECK: for.cond.cleanup26.loopexit: +; CHECK-NEXT: [[DOTLCSSA207:%.*]] = phi float [ [[TMP47]], [[FOR_BODY14_CLONE]] ] +; CHECK-NEXT: [[N_0_LCSSA_NEG:%.*]] = sub i32 0, [[N_0_LCSSA]] +; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP12]], [[N_0_LCSSA_NEG]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP26]] ; CHECK: for.cond.cleanup26: -; CHECK-NEXT: [[ACC_1_LCSSA:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND23_PREHEADER]] ], [ [[TMP17:%.*]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[INC39]] = add nuw nsw i32 [[I_080]], 1 +; CHECK-NEXT: [[COEFF_POS_1_LCSSA]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_COND63_PREHEADER]] ], [ [[TMP48]], [[FOR_COND_CLEANUP26_LOOPEXIT]] ] +; CHECK-NEXT: [[ACC_1_LCSSA]] = phi float [ [[ACC_0_LCSSA]], [[FOR_COND63_PREHEADER]] ], [ [[DOTLCSSA207]], [[FOR_COND_CLEANUP26_LOOPEXIT]] ] +; CHECK-NEXT: [[EXITCOND85_NOT:%.*]] = icmp slt i32 [[TMP4]], 8 +; CHECK-NEXT: br i1 [[EXITCOND85_NOT]], label [[FOR_COND130_PREHEADER:%.*]], label [[FOR_BODY79_LR_PH:%.*]] +; CHECK: for.body27.7: +; CHECK-NEXT: [[LSR_IV167:%.*]] = phi i32 [ [[COEFF_POS_1_LCSSA]], 
[[FOR_BODY79_LR_PH]] ], [ [[LSR_IV_NEXT168]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[LSR_IV151:%.*]] = phi i32 [ 0, [[FOR_BODY79_LR_PH]] ], [ [[LSR_IV_NEXT152:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ADD76310:%.*]] = phi i32 [ 8, [[FOR_BODY79_LR_PH]] ], [ [[ADD76:%.*]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC38:%.*]] = phi float [ [[ACC_1_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP51]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC39:%.*]] = phi float [ [[ACC_1_LCSSA2]], [[FOR_BODY79_LR_PH]] ], [ [[TMP54]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC40:%.*]] = phi float [ [[ACC_2_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP57]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC41:%.*]] = phi float [ [[ACC_3_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP60]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC42:%.*]] = phi float [ [[ACC_4_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP63]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC43:%.*]] = phi float [ [[ACC_5_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP66]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC44:%.*]] = phi float [ [[ACC_6_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP69]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[ACC45:%.*]] = phi float [ [[ACC_7_LCSSA]], [[FOR_BODY79_LR_PH]] ], [ [[TMP72]], [[FOR_BODY27_7]] ] +; CHECK-NEXT: [[SCEVGEP191:%.*]] = getelementptr i8, ptr [[SCEVGEP190]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCEVGEP191]], align 4 +; CHECK-NEXT: [[SCEVGEP166:%.*]] = getelementptr i8, ptr [[TMP42]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[SCEVGEP166]], align 4 +; CHECK-NEXT: [[TMP51]] = tail call float @llvm.fmuladd.f32(float [[TMP49]], float [[TMP50]], float [[ACC38]]) +; CHECK-NEXT: [[SCEVGEP189:%.*]] = getelementptr i8, ptr [[SCEVGEP188]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCEVGEP189]], align 4 +; CHECK-NEXT: [[SCEVGEP165:%.*]] = getelementptr i8, ptr [[SCEVGEP164]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCEVGEP165]], align 4 +; CHECK-NEXT: [[TMP54]] = tail call float @llvm.fmuladd.f32(float [[TMP52]], float [[TMP53]], float [[ACC39]]) +; CHECK-NEXT: [[SCEVGEP186:%.*]] = getelementptr i8, ptr [[SCEVGEP185]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP55:%.*]] = load float, ptr [[SCEVGEP186]], align 4 +; CHECK-NEXT: [[SCEVGEP163:%.*]] = getelementptr i8, ptr [[SCEVGEP162]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP56:%.*]] = load float, ptr [[SCEVGEP163]], align 4 +; CHECK-NEXT: [[TMP57]] = tail call float @llvm.fmuladd.f32(float [[TMP55]], float [[TMP56]], float [[ACC40]]) +; CHECK-NEXT: [[SCEVGEP183:%.*]] = getelementptr i8, ptr [[SCEVGEP182]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[SCEVGEP183]], align 4 +; CHECK-NEXT: [[SCEVGEP161:%.*]] = getelementptr i8, ptr [[SCEVGEP160]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[SCEVGEP161]], align 4 +; CHECK-NEXT: [[TMP60]] = tail call float @llvm.fmuladd.f32(float [[TMP58]], float [[TMP59]], float [[ACC41]]) +; CHECK-NEXT: [[SCEVGEP180:%.*]] = getelementptr i8, ptr [[SCEVGEP179]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[SCEVGEP180]], align 4 +; CHECK-NEXT: [[SCEVGEP159:%.*]] = getelementptr i8, ptr [[SCEVGEP158]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCEVGEP159]], align 4 +; CHECK-NEXT: [[TMP63]] = tail call float @llvm.fmuladd.f32(float [[TMP61]], float [[TMP62]], float [[ACC42]]) +; CHECK-NEXT: [[SCEVGEP177:%.*]] = getelementptr i8, ptr [[SCEVGEP176]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP64:%.*]] = load float, ptr 
[[SCEVGEP177]], align 4 +; CHECK-NEXT: [[SCEVGEP157:%.*]] = getelementptr i8, ptr [[SCEVGEP156]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP65:%.*]] = load float, ptr [[SCEVGEP157]], align 4 +; CHECK-NEXT: [[TMP66]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[ACC43]]) +; CHECK-NEXT: [[SCEVGEP174:%.*]] = getelementptr i8, ptr [[SCEVGEP173]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCEVGEP174]], align 4 +; CHECK-NEXT: [[SCEVGEP155:%.*]] = getelementptr i8, ptr [[SCEVGEP154]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP68:%.*]] = load float, ptr [[SCEVGEP155]], align 4 +; CHECK-NEXT: [[TMP69]] = tail call float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[ACC44]]) +; CHECK-NEXT: [[SCEVGEP171:%.*]] = getelementptr i8, ptr [[SCEVGEP170]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP70:%.*]] = load float, ptr [[SCEVGEP171]], align 4 +; CHECK-NEXT: [[SCEVGEP153:%.*]] = getelementptr i8, ptr [[SCEVGEP150]], i32 [[LSR_IV151]] +; CHECK-NEXT: [[TMP71:%.*]] = load float, ptr [[SCEVGEP153]], align 4 +; CHECK-NEXT: [[TMP72]] = tail call float @llvm.fmuladd.f32(float [[TMP70]], float [[TMP71]], float [[ACC45]]) +; CHECK-NEXT: [[ADD76]] = add nuw nsw i32 [[ADD76310]], 8 +; CHECK-NEXT: [[LSR_IV_NEXT152]] = add nuw i32 [[LSR_IV151]], 32 +; CHECK-NEXT: [[LSR_IV_NEXT168]] = add i32 [[LSR_IV167]], 8 +; CHECK-NEXT: [[EXITCOND84_NOT_7:%.*]] = icmp sgt i32 [[ADD76]], [[TMP4]] +; CHECK-NEXT: br i1 [[EXITCOND84_NOT_7]], label [[FOR_COND130_PREHEADER]], label [[FOR_BODY27_7]] +; CHECK: for.body133.lr.ph: +; CHECK-NEXT: [[TMP73:%.*]] = load ptr, ptr [[FIR]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = load ptr, ptr [[DELAY]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP4]], [[N_0_LCSSA_CLONE]] +; CHECK-NEXT: [[TMP76:%.*]] = shl i32 [[N_0_LCSSA_CLONE]], 2 +; CHECK-NEXT: [[SCEVGEP194:%.*]] = getelementptr i8, ptr [[TMP74]], i32 [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = shl i32 [[COEFF_POS_0_LCSSA_CLONE]], 2 +; CHECK-NEXT: [[SCEVGEP197:%.*]] = getelementptr i8, ptr [[TMP73]], i32 [[TMP77]] +; CHECK-NEXT: br label [[FOR_BODY27_CLONE:%.*]] +; CHECK: for.body27.clone: +; CHECK-NEXT: [[LSR_IV198:%.*]] = phi ptr [ [[SCEVGEP199:%.*]], [[FOR_BODY27_CLONE]] ], [ [[SCEVGEP197]], [[FOR_BODY133_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV195:%.*]] = phi ptr [ [[SCEVGEP196:%.*]], [[FOR_BODY27_CLONE]] ], [ [[SCEVGEP194]], [[FOR_BODY133_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV192:%.*]] = phi i32 [ [[LSR_IV_NEXT193:%.*]], [[FOR_BODY27_CLONE]] ], [ [[TMP75]], [[FOR_BODY133_LR_PH]] ] +; CHECK-NEXT: [[ACC_173_CLONE:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_BODY133_LR_PH]] ], [ [[TMP80:%.*]], [[FOR_BODY27_CLONE]] ] +; CHECK-NEXT: [[TMP78:%.*]] = load float, ptr [[LSR_IV198]], align 4 +; CHECK-NEXT: [[TMP79:%.*]] = load float, ptr [[LSR_IV195]], align 4 +; CHECK-NEXT: [[TMP80]] = tail call float @llvm.fmuladd.f32(float [[TMP78]], float [[TMP79]], float [[ACC_173_CLONE]]) +; CHECK-NEXT: [[LSR_IV_NEXT193]] = add i32 [[LSR_IV192]], -1 +; CHECK-NEXT: [[SCEVGEP196]] = getelementptr i8, ptr [[LSR_IV195]], i32 4 +; CHECK-NEXT: [[SCEVGEP199]] = getelementptr i8, ptr [[LSR_IV198]], i32 4 +; CHECK-NEXT: [[EXITCOND84_NOT_CLONE:%.*]] = icmp eq i32 [[LSR_IV_NEXT193]], 0 +; CHECK-NEXT: br i1 [[EXITCOND84_NOT_CLONE]], label [[FOR_END141]], label [[FOR_BODY27_CLONE]] +; CHECK: for.end141: +; CHECK-NEXT: [[ACC0_3_LCSSA:%.*]] = phi float [ [[ACC_0_LCSSA_CLONE]], [[FOR_COND130_PREHEADER]] ], [ [[TMP80]], [[FOR_BODY27_CLONE]] ] +; CHECK-NEXT: [[ADD60:%.*]] = fadd float [[ACC_1_LCSSA2_CLONE]], 
[[ACC0_3_LCSSA]] +; CHECK-NEXT: [[ADD6179:%.*]] = fadd float [[ACC_2_LCSSA_CLONE]], [[ACC_3_LCSSA_CLONE]] +; CHECK-NEXT: [[ADD62:%.*]] = fadd float [[ACC_4_LCSSA_CLONE]], [[ACC_5_LCSSA_CLONE]] +; CHECK-NEXT: [[ADD6380:%.*]] = fadd float [[ACC_6_LCSSA_CLONE]], [[ACC_7_LCSSA_CLONE]] +; CHECK-NEXT: [[ADD64:%.*]] = fadd float [[ADD6179]], [[ADD60]] +; CHECK-NEXT: [[ADD6581:%.*]] = fadd float [[ADD62]], [[ADD6380]] +; CHECK-NEXT: [[ADD66:%.*]] = fadd float [[ADD6581]], [[ADD64]] ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_080]] -; CHECK-NEXT: store float [[ACC_1_LCSSA]], ptr [[ARRAYIDX37]], align 4 -; CHECK-NEXT: [[EXITCOND85_NOT:%.*]] = icmp eq i32 [[INC39]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND85_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]] -; CHECK: for.body27: -; CHECK-NEXT: [[N22_075:%.*]] = phi i32 [ 0, [[FOR_BODY27_LR_PH]] ], [ [[INC34:%.*]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[COEFF_POS_174:%.*]] = phi i32 [ [[COEFF_POS_0_LCSSA]], [[FOR_BODY27_LR_PH]] ], [ [[INC29:%.*]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[ACC_173:%.*]] = phi float [ [[ACC_0_LCSSA]], [[FOR_BODY27_LR_PH]] ], [ [[TMP17]], [[FOR_BODY27]] ] -; CHECK-NEXT: [[INC29]] = add nuw nsw i32 [[COEFF_POS_174]], 1 -; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[COEFF_POS_174]] -; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX30]], align 4 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 [[N22_075]] -; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[TMP17]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[ACC_173]]) -; CHECK-NEXT: [[INC34]] = add nuw nsw i32 [[N22_075]], 1 -; CHECK-NEXT: [[EXITCOND84_NOT:%.*]] = icmp eq i32 [[INC34]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND84_NOT]], label [[FOR_COND_CLEANUP26]], label [[FOR_BODY27]] +; CHECK-NEXT: store float [[ADD66]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: [[INC152]] = add nuw nsw i32 [[I_080]], 1 +; CHECK-NEXT: [[EXITCOND350_NOT:%.*]] = icmp eq i32 [[INC152]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND350_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]] ; entry: %cmp77 = icmp sgt i32 %len, 0 diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll index a4fb7808a4f8ee..aa9f66e46f4e89 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/loopsecvconstant.ll @@ -1,23 +1,79 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local float @test_loop(ptr nocapture noundef readonly %data1, ptr nocapture noundef readonly %data2) local_unnamed_addr { ; CHECK-LABEL: define dso_local float @test_loop( -; CHECK-SAME: ptr nocapture noundef readonly [[DATA1:%.*]], ptr nocapture noundef readonly [[DATA2:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias nocapture noundef readonly [[DATA1:%.*]], ptr noalias nocapture noundef readonly [[DATA2:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; 
CHECK: for.cond.cleanup: -; CHECK-NEXT: ret float [[TMP2:%.*]] +; CHECK: for.end: +; CHECK-NEXT: [[ADD37:%.*]] = fadd float [[TMP16:%.*]], [[TMP17:%.*]] +; CHECK-NEXT: [[ADD38:%.*]] = fadd float [[TMP18:%.*]], [[TMP19:%.*]] +; CHECK-NEXT: [[ADD39:%.*]] = fadd float [[TMP20:%.*]], [[TMP21:%.*]] +; CHECK-NEXT: [[ADD40:%.*]] = fadd float [[TMP22:%.*]], [[TMP23:%.*]] +; CHECK-NEXT: [[ADD41:%.*]] = fadd float [[ADD37]], [[ADD38]] +; CHECK-NEXT: [[ADD42:%.*]] = fadd float [[ADD39]], [[ADD40]] +; CHECK-NEXT: [[ADD43:%.*]] = fadd float [[ADD41]], [[ADD42]] +; CHECK-NEXT: ret float [[ADD43]] ; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[RESULT_06:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP2]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI4:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP20]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI6:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP22]], [[FOR_BODY]] ] +; CHECK-NEXT: [[DOTPHI7:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INC_7]] = add nuw nsw i32 [[I_07]], 8 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_07]], 1 +; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i32 [[I_07]], 2 +; CHECK-NEXT: [[ADD11:%.*]] = or disjoint i32 [[I_07]], 3 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_07]], 4 +; CHECK-NEXT: [[ADD15:%.*]] = or disjoint i32 [[I_07]], 5 +; CHECK-NEXT: [[ADD17:%.*]] = or disjoint i32 [[I_07]], 6 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[I_07]], 7 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[I_07]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[I_07]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD9]] +; CHECK-NEXT: [[ARRAYIDX1_2:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD9]] +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX1_3:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX1_4:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD15]] +; CHECK-NEXT: [[ARRAYIDX1_5:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD15]] +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD17]] +; CHECK-NEXT: [[ARRAYIDX1_6:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD17]] +; 
CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX1_7:%.*]] = getelementptr inbounds float, ptr [[DATA2]], i32 [[ADD19]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[RESULT_06]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX1_1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX1_2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX1_3]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1_4]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX1_5]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX_6]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX1_6]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX_7]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX1_7]], align 4 +; CHECK-NEXT: [[TMP16]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[DOTPHI]]) +; CHECK-NEXT: [[TMP17]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float [[TMP3]], float [[DOTPHI1]]) +; CHECK-NEXT: [[TMP18]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP5]], float [[DOTPHI2]]) +; CHECK-NEXT: [[TMP19]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[DOTPHI3]]) +; CHECK-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[DOTPHI4]]) +; CHECK-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP11]], float [[DOTPHI5]]) +; CHECK-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float [[TMP13]], float [[DOTPHI6]]) +; CHECK-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float [[TMP14]], float [[TMP15]], float [[DOTPHI7]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC_7]], 1009 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; entry: br label %for.body diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll index bcf9852fd491ee..1a6c4fda2b5126 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mul.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, ptr noundef readonly %input2, ptr noundef writeonly %output, i32 
noundef %len, i32 noundef %step1, i32 noundef %step2, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_mul_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT1:%.*]], ptr noundef readonly [[INPUT2:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT1:%.*]], ptr noalias noundef readonly [[INPUT2:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT1]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[INPUT2]], null @@ -12,19 +12,159 @@ define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: br i1 [[OR_COND20]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP721:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP721]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_022_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_022_MODIFY]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_022_MODIFY]], 1 +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_022_MODIFY]], 2 +; CHECK-NEXT: [[ADD8:%.*]] = or disjoint i32 [[I_022_MODIFY]], 3 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[I_022_MODIFY]], 4 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_022_MODIFY]], 5 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_022_MODIFY]], 6 +; CHECK-NEXT: [[ADD26:%.*]] = or disjoint i32 [[I_022_MODIFY]], 7 +; CHECK-NEXT: [[ADD30:%.*]] = or disjoint i32 [[I_022_MODIFY]], 8 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_022_MODIFY]], 9 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_022_MODIFY]], 10 +; CHECK-NEXT: [[ADD42:%.*]] = or disjoint i32 [[I_022_MODIFY]], 11 +; CHECK-NEXT: [[ADD46:%.*]] = or disjoint i32 [[I_022_MODIFY]], 12 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[I_022_MODIFY]], 13 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[I_022_MODIFY]], 14 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_022_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[I_022_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX9_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[I_022_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX12_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_022_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] 
= getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr 
inbounds float, ptr [[INPUT2]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX53:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_MODIFY]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX43]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[MUL10_MODIFY:%.*]] = fmul float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP34:%.*]] = fmul float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = fmul float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP37:%.*]] = fmul float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP38:%.*]] = fmul float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP39:%.*]] = fmul float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP41:%.*]] = fmul float [[TMP18]], [[TMP19]] 
+; CHECK-NEXT: [[TMP42:%.*]] = fmul float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP43:%.*]] = fmul float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP44:%.*]] = fmul float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP45:%.*]] = fmul float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP46:%.*]] = fmul float [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[TMP47:%.*]] = fmul float [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP48:%.*]] = fmul float [[TMP32]], [[TMP33]] +; CHECK-NEXT: store float [[MUL10_MODIFY]], ptr [[ARRAYIDX12_MODIFY]], align 4 +; CHECK-NEXT: store float [[TMP34]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: store float [[TMP35]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: store float [[TMP36]], ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: store float [[TMP37]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[TMP38]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[TMP39]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: store float [[TMP40]], ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: store float [[TMP41]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[TMP42]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[TMP43]], ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: store float [[TMP44]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: store float [[TMP45]], ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: store float [[TMP46]], ptr [[ARRAYIDX53]], align 4 +; CHECK-NEXT: store float [[TMP47]], ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[TMP48]], ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_022:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_022:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_022]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[I_022]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[MUL10:%.*]] = fmul float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[MUL10:%.*]] = fmul float [[TMP49]], [[TMP50]] ; CHECK-NEXT: [[MUL11:%.*]] = mul nsw i32 [[I_022]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL11]] ; CHECK-NEXT: store float [[MUL10]], ptr [[ARRAYIDX12]], align 4 @@ -35,11 +175,11 @@ define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[I_022_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_022_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL8_CLONE:%.*]] = mul nsw i32 [[I_022_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 
[[MUL8_CLONE]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 -; CHECK-NEXT: [[MUL10_CLONE:%.*]] = fmul float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 +; CHECK-NEXT: [[MUL10_CLONE:%.*]] = fmul float [[TMP51]], [[TMP52]] ; CHECK-NEXT: [[MUL11_CLONE:%.*]] = mul nsw i32 [[I_022_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX12_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL11_CLONE]] ; CHECK-NEXT: store float [[MUL10_CLONE]], ptr [[ARRAYIDX12_CLONE]], align 4 @@ -47,7 +187,7 @@ define dso_local noundef i32 @dsps_mul_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll index 2c81f5bfd4b6f3..bf4e757def1373 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/mulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %input, ptr noalias noundef writeonly %output, i32 noundef %len, float noundef %C, i32 noundef %step_in, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_mulc_f32_ansi( ; CHECK-SAME: ptr noalias noundef readonly [[INPUT:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], float noundef [[C:%.*]], i32 noundef [[STEP_IN:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { @@ -10,16 +10,124 @@ define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %i ; CHECK-NEXT: br i1 [[OR_COND]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP413:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP413]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 
[[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_014_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_014_MODIFY]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_014_MODIFY]], 1 +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_014_MODIFY]], 2 +; CHECK-NEXT: [[ADD6:%.*]] = or disjoint i32 [[I_014_MODIFY]], 3 +; CHECK-NEXT: [[ADD10:%.*]] = or disjoint i32 [[I_014_MODIFY]], 4 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_014_MODIFY]], 5 +; CHECK-NEXT: [[ADD16:%.*]] = or disjoint i32 [[I_014_MODIFY]], 6 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[I_014_MODIFY]], 7 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_014_MODIFY]], 8 +; CHECK-NEXT: [[ADD25:%.*]] = or disjoint i32 [[I_014_MODIFY]], 9 +; CHECK-NEXT: [[ADD28:%.*]] = or disjoint i32 [[I_014_MODIFY]], 10 +; CHECK-NEXT: [[ADD31:%.*]] = or disjoint i32 [[I_014_MODIFY]], 11 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_014_MODIFY]], 12 +; CHECK-NEXT: [[ADD37:%.*]] = or disjoint i32 [[I_014_MODIFY]], 13 +; CHECK-NEXT: [[ADD40:%.*]] = or disjoint i32 [[I_014_MODIFY]], 14 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[I_014_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_014_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX7_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_014_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD6]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD16]] +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD16]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD25]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD25]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD31]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], 
i32 [[ADD31]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD37]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD37]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD40]] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD40]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[MUL5_MODIFY:%.*]] = fmul float [[C]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = fmul float [[C]], [[TMP3]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul float [[C]], [[TMP4]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul float [[C]], [[TMP5]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul float [[C]], [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = fmul float [[C]], [[TMP7]] +; CHECK-NEXT: [[TMP23:%.*]] = fmul float [[C]], [[TMP8]] +; CHECK-NEXT: [[TMP24:%.*]] = fmul float [[C]], [[TMP9]] +; CHECK-NEXT: [[TMP25:%.*]] = fmul float [[C]], [[TMP10]] +; CHECK-NEXT: [[TMP26:%.*]] = fmul float [[C]], [[TMP11]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul float [[C]], [[TMP12]] +; CHECK-NEXT: [[TMP28:%.*]] = fmul float [[C]], [[TMP13]] +; CHECK-NEXT: [[TMP29:%.*]] = fmul float [[C]], [[TMP14]] +; CHECK-NEXT: [[TMP30:%.*]] = fmul float [[C]], [[TMP15]] +; CHECK-NEXT: [[TMP31:%.*]] = fmul float [[C]], [[TMP16]] +; CHECK-NEXT: [[TMP32:%.*]] = fmul float [[C]], [[TMP17]] +; CHECK-NEXT: store float [[MUL5_MODIFY]], ptr [[ARRAYIDX7_MODIFY]], align 4 +; CHECK-NEXT: store float [[TMP18]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: store float [[TMP19]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: store float [[TMP20]], ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: store float [[TMP21]], ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: store float [[TMP22]], ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: store float [[TMP23]], ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: store float [[TMP24]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[TMP25]], ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: store float [[TMP26]], ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: store float 
[[TMP27]], ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: store float [[TMP28]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[TMP29]], ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: store float [[TMP30]], ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: store float [[TMP31]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: store float [[TMP32]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_014]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL5:%.*]] = fmul float [[TMP0]], [[C]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul float [[C]], [[TMP33]] ; CHECK-NEXT: [[MUL6:%.*]] = mul nsw i32 [[I_014]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL6]] ; CHECK-NEXT: store float [[MUL5]], ptr [[ARRAYIDX7]], align 4 @@ -30,8 +138,8 @@ define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %i ; CHECK-NEXT: [[I_014_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_014_CLONE]], [[STEP_IN]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[MUL5_CLONE:%.*]] = fmul float [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[MUL5_CLONE:%.*]] = fmul float [[C]], [[TMP34]] ; CHECK-NEXT: [[MUL6_CLONE:%.*]] = mul nsw i32 [[I_014_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX7_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL6_CLONE]] ; CHECK-NEXT: store float [[MUL5_CLONE]], ptr [[ARRAYIDX7_CLONE]], align 4 @@ -39,7 +147,7 @@ define dso_local noundef i32 @dsps_mulc_f32_ansi(ptr noalias noundef readonly %i ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll index 99ac2877f76c6b..89c891af406691 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sqrt.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; 
RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, ptr noundef writeonly %output, i32 noundef %len) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_sqrt_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[OUTPUT]], null @@ -10,15 +10,139 @@ define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: br i1 [[OR_COND]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP411:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP411]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP32:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_012_MODIFY:%.*]] = phi i32 [ [[TMP32]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_012_MODIFY]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[SHR_I_MODIFY:%.*]] = ashr i32 [[TMP1]], 1 +; CHECK-NEXT: [[ADD48:%.*]] = or disjoint i32 [[SHR_I_MODIFY]], 532365312 +; CHECK-NEXT: [[ARRAYIDX5_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_012_MODIFY]] +; CHECK-NEXT: store i32 [[ADD48]], ptr [[ARRAYIDX5_MODIFY]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_012_MODIFY]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP2]], 1 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[TMP3]], 532365312 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: store i32 [[ADD50]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = or disjoint i32 [[I_012_MODIFY]], 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD3]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ashr i32 [[TMP4]], 1 +; CHECK-NEXT: [[ADD52:%.*]] = or disjoint i32 [[TMP5]], 532365312 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD3]] +; CHECK-NEXT: store i32 [[ADD52]], ptr [[ARRAYIDX6]], 
align 4 +; CHECK-NEXT: [[ADD7:%.*]] = or disjoint i32 [[I_012_MODIFY]], 3 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD7]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ashr i32 [[TMP6]], 1 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[TMP7]], 532365312 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD7]] +; CHECK-NEXT: store i32 [[ADD54]], ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[ADD10:%.*]] = or disjoint i32 [[I_012_MODIFY]], 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD10]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = ashr i32 [[TMP8]], 1 +; CHECK-NEXT: [[ADD56:%.*]] = or disjoint i32 [[TMP9]], 532365312 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD10]] +; CHECK-NEXT: store i32 [[ADD56]], ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ADD13:%.*]] = or disjoint i32 [[I_012_MODIFY]], 5 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD13]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = ashr i32 [[TMP10]], 1 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[TMP11]], 532365312 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD13]] +; CHECK-NEXT: store i32 [[ADD58]], ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[ADD16:%.*]] = or disjoint i32 [[I_012_MODIFY]], 6 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD16]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = ashr i32 [[TMP12]], 1 +; CHECK-NEXT: [[ADD60:%.*]] = or disjoint i32 [[TMP13]], 532365312 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD16]] +; CHECK-NEXT: store i32 [[ADD60]], ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[ADD19:%.*]] = or disjoint i32 [[I_012_MODIFY]], 7 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD19]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = ashr i32 [[TMP14]], 1 +; CHECK-NEXT: [[ADD62:%.*]] = or disjoint i32 [[TMP15]], 532365312 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD19]] +; CHECK-NEXT: store i32 [[ADD62]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_012_MODIFY]], 8 +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = ashr i32 [[TMP16]], 1 +; CHECK-NEXT: [[ADD64:%.*]] = or disjoint i32 [[TMP17]], 532365312 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: store i32 [[ADD64]], ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ADD25:%.*]] = or disjoint i32 [[I_012_MODIFY]], 9 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD25]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = ashr i32 [[TMP18]], 1 +; CHECK-NEXT: [[ADD66:%.*]] = or disjoint i32 [[TMP19]], 532365312 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD25]] +; CHECK-NEXT: store i32 [[ADD66]], ptr 
[[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[ADD28:%.*]] = or disjoint i32 [[I_012_MODIFY]], 10 +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD28]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = ashr i32 [[TMP20]], 1 +; CHECK-NEXT: [[ADD68:%.*]] = or disjoint i32 [[TMP21]], 532365312 +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD28]] +; CHECK-NEXT: store i32 [[ADD68]], ptr [[ARRAYIDX30]], align 4 +; CHECK-NEXT: [[ADD31:%.*]] = or disjoint i32 [[I_012_MODIFY]], 11 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD31]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = ashr i32 [[TMP22]], 1 +; CHECK-NEXT: [[ADD70:%.*]] = or disjoint i32 [[TMP23]], 532365312 +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD31]] +; CHECK-NEXT: store i32 [[ADD70]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_012_MODIFY]], 12 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = ashr i32 [[TMP24]], 1 +; CHECK-NEXT: [[ADD72:%.*]] = or disjoint i32 [[TMP25]], 532365312 +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: store i32 [[ADD72]], ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[ADD37:%.*]] = or disjoint i32 [[I_012_MODIFY]], 13 +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD37]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX38]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = ashr i32 [[TMP26]], 1 +; CHECK-NEXT: [[ADD74:%.*]] = or disjoint i32 [[TMP27]], 532365312 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD37]] +; CHECK-NEXT: store i32 [[ADD74]], ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[ADD40:%.*]] = or disjoint i32 [[I_012_MODIFY]], 14 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD40]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = ashr i32 [[TMP28]], 1 +; CHECK-NEXT: [[ADD76:%.*]] = or disjoint i32 [[TMP29]], 532365312 +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD40]] +; CHECK-NEXT: store i32 [[ADD76]], ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: [[ADD43:%.*]] = or disjoint i32 [[I_012_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[ADD43]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = ashr i32 [[TMP30]], 1 +; CHECK-NEXT: [[ADD78:%.*]] = or disjoint i32 [[TMP31]], 532365312 +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD43]] +; CHECK-NEXT: store i32 [[ADD78]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: [[TMP32]] = add nuw i32 [[I_012_MODIFY]], 16 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP32]], [[SUB]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], 
[[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_012]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[TMP33]], 1 ; CHECK-NEXT: [[ADD_I:%.*]] = add nsw i32 [[SHR_I]], 532365312 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_012]] ; CHECK-NEXT: store i32 [[ADD_I]], ptr [[ARRAYIDX5]], align 4 @@ -28,8 +152,8 @@ define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, pt ; CHECK: for.body.clone: ; CHECK-NEXT: [[I_012_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i32 [[I_012_CLONE]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_CLONE]], align 4 -; CHECK-NEXT: [[SHR_I_CLONE:%.*]] = ashr i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[SHR_I_CLONE:%.*]] = ashr i32 [[TMP34]], 1 ; CHECK-NEXT: [[ADD_I_CLONE:%.*]] = add nsw i32 [[SHR_I_CLONE]], 532365312 ; CHECK-NEXT: [[ARRAYIDX5_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_012_CLONE]] ; CHECK-NEXT: store i32 [[ADD_I_CLONE]], ptr [[ARRAYIDX5_CLONE]], align 4 @@ -37,7 +161,7 @@ define dso_local noundef i32 @dsps_sqrt_f32_ansi(ptr noundef readonly %input, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: diff --git a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll index 9468a11ba62329..19bca2d13e120e 100644 --- a/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll +++ b/llvm/test/CodeGen/RISCV/RISCVLoopUnrollAndRemainder/sub.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=false < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-esp-unknown-elf -passes=riscv-loop-unroll-and-remainder -riscv-loop-unroll-and-remainder=true < %s | FileCheck %s define dso_local noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, ptr noundef readonly %input2, ptr noundef writeonly %output, i32 noundef %len, i32 noundef %step1, i32 noundef %step2, i32 noundef %step_out) local_unnamed_addr { ; CHECK-LABEL: define dso_local noundef i32 @dsps_sub_f32_ansi( -; CHECK-SAME: ptr noundef readonly [[INPUT1:%.*]], ptr noundef readonly [[INPUT2:%.*]], ptr noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr noalias noundef readonly [[INPUT1:%.*]], ptr noalias noundef readonly [[INPUT2:%.*]], ptr noalias noundef writeonly [[OUTPUT:%.*]], i32 noundef [[LEN:%.*]], i32 
noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]], i32 noundef [[STEP_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[INPUT1]], null ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[INPUT2]], null @@ -12,19 +12,159 @@ define dso_local noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: br i1 [[OR_COND19]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[LEN]], 2 -; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_COND_PREHEADER_NEW:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[CMP720:%.*]] = icmp sgt i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[CMP720]], label [[FOR_BODY_CLONE:%.*]], label [[RETURN]] +; CHECK: for.cond.preheader.new: +; CHECK-NEXT: [[SUB63:%.*]] = add nsw i32 [[LEN]], -16 +; CHECK-NEXT: [[CMP6_NOT207:%.*]] = icmp ult i32 [[LEN]], 16 +; CHECK-NEXT: br i1 [[CMP6_NOT207]], label [[FOR_COND_PREHEADER_NEW2:%.*]], label [[FOR_BODY_MODIFY:%.*]] +; CHECK: for.cond.preheader.new2: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[CMP85209:%.*]] = icmp slt i32 [[TMP0]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP85209]], label [[FOR_BODY:%.*]], label [[RETURN]] +; CHECK: for.body.modify: +; CHECK-NEXT: [[I_021_MODIFY:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY_MODIFY]] ], [ 0, [[FOR_COND_PREHEADER_NEW]] ] +; CHECK-NEXT: [[TMP1]] = add nuw i32 [[I_021_MODIFY]], 16 +; CHECK-NEXT: [[ADD:%.*]] = or disjoint i32 [[I_021_MODIFY]], 1 +; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_021_MODIFY]], 2 +; CHECK-NEXT: [[ADD8:%.*]] = or disjoint i32 [[I_021_MODIFY]], 3 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[I_021_MODIFY]], 4 +; CHECK-NEXT: [[ADD18:%.*]] = or disjoint i32 [[I_021_MODIFY]], 5 +; CHECK-NEXT: [[ADD22:%.*]] = or disjoint i32 [[I_021_MODIFY]], 6 +; CHECK-NEXT: [[ADD26:%.*]] = or disjoint i32 [[I_021_MODIFY]], 7 +; CHECK-NEXT: [[ADD30:%.*]] = or disjoint i32 [[I_021_MODIFY]], 8 +; CHECK-NEXT: [[ADD34:%.*]] = or disjoint i32 [[I_021_MODIFY]], 9 +; CHECK-NEXT: [[ADD38:%.*]] = or disjoint i32 [[I_021_MODIFY]], 10 +; CHECK-NEXT: [[ADD42:%.*]] = or disjoint i32 [[I_021_MODIFY]], 11 +; CHECK-NEXT: [[ADD46:%.*]] = or disjoint i32 [[I_021_MODIFY]], 12 +; CHECK-NEXT: [[ADD50:%.*]] = or disjoint i32 [[I_021_MODIFY]], 13 +; CHECK-NEXT: [[ADD54:%.*]] = or disjoint i32 [[I_021_MODIFY]], 14 +; CHECK-NEXT: [[ADD58:%.*]] = or disjoint i32 [[I_021_MODIFY]], 15 +; CHECK-NEXT: [[ARRAYIDX_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX9_MODIFY:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX11_MODIFY:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[I_021_MODIFY]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD4]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr 
[[INPUT1]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD8]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD14]] +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD22]] +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD26]] +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD30]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD34]] +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD42]] +; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD46]] +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX53:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD50]] +; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD54]] +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds float, ptr 
[[INPUT2]], i32 [[ADD58]] +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[ADD58]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_MODIFY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_MODIFY]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX23]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX40]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX43]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX47]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX55]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX59]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[SUB_MODIFY:%.*]] = fsub float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP34:%.*]] = fsub float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = fsub float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP36:%.*]] = fsub float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP37:%.*]] = fsub float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP38:%.*]] = fsub float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP39:%.*]] = fsub float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP40:%.*]] = fsub float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP41:%.*]] = fsub float [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP42:%.*]] = fsub float [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP43:%.*]] = fsub float [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP44:%.*]] = fsub float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP45:%.*]] = fsub float [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP46:%.*]] = fsub float [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[TMP47:%.*]] = fsub float [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP48:%.*]] = fsub float [[TMP32]], [[TMP33]] +; CHECK-NEXT: store float [[SUB_MODIFY]], ptr [[ARRAYIDX11_MODIFY]], align 4 +; CHECK-NEXT: store float [[TMP34]], ptr [[ARRAYIDX3]], align 4 +; 
CHECK-NEXT: store float [[TMP35]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: store float [[TMP36]], ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: store float [[TMP37]], ptr [[ARRAYIDX17]], align 4 +; CHECK-NEXT: store float [[TMP38]], ptr [[ARRAYIDX21]], align 4 +; CHECK-NEXT: store float [[TMP39]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: store float [[TMP40]], ptr [[ARRAYIDX29]], align 4 +; CHECK-NEXT: store float [[TMP41]], ptr [[ARRAYIDX33]], align 4 +; CHECK-NEXT: store float [[TMP42]], ptr [[ARRAYIDX37]], align 4 +; CHECK-NEXT: store float [[TMP43]], ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: store float [[TMP44]], ptr [[ARRAYIDX45]], align 4 +; CHECK-NEXT: store float [[TMP45]], ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: store float [[TMP46]], ptr [[ARRAYIDX53]], align 4 +; CHECK-NEXT: store float [[TMP47]], ptr [[ARRAYIDX57]], align 4 +; CHECK-NEXT: store float [[TMP48]], ptr [[ARRAYIDX61]], align 4 +; CHECK-NEXT: [[EXITCOND_NOT_MODIFY:%.*]] = icmp sgt i32 [[TMP1]], [[SUB63]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_MODIFY]], label [[FOR_COND_PREHEADER_NEW2]], label [[FOR_BODY_MODIFY]] ; CHECK: for.body: -; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_END]] ] +; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_021]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[I_021]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP50:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[TMP49]], [[TMP50]] ; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[I_021]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10]] ; CHECK-NEXT: store float [[SUB]], ptr [[ARRAYIDX11]], align 4 @@ -35,11 +175,11 @@ define dso_local noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[I_021_CLONE:%.*]] = phi i32 [ [[INC_CLONE:%.*]], [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER]] ] ; CHECK-NEXT: [[MUL_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP1]] ; CHECK-NEXT: [[ARRAYIDX_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT1]], i32 [[MUL_CLONE]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[ARRAYIDX_CLONE]], align 4 ; CHECK-NEXT: [[MUL8_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP2]] ; CHECK-NEXT: [[ARRAYIDX9_CLONE:%.*]] = getelementptr inbounds float, ptr [[INPUT2]], i32 [[MUL8_CLONE]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 -; CHECK-NEXT: [[SUB_CLONE:%.*]] = fsub float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX9_CLONE]], align 4 +; CHECK-NEXT: [[SUB_CLONE:%.*]] = fsub float [[TMP51]], [[TMP52]] ; CHECK-NEXT: [[MUL10_CLONE:%.*]] = mul nsw i32 [[I_021_CLONE]], [[STEP_OUT]] ; CHECK-NEXT: [[ARRAYIDX11_CLONE:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i32 [[MUL10_CLONE]] ; CHECK-NEXT: store float [[SUB_CLONE]], ptr [[ARRAYIDX11_CLONE]], align 4 @@ -47,7 +187,7 @@ define dso_local 
noundef i32 @dsps_sub_f32_ansi(ptr noundef readonly %input1, pt ; CHECK-NEXT: [[EXITCOND_NOT_CLONE:%.*]] = icmp eq i32 [[INC_CLONE]], [[LEN]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT_CLONE]], label [[RETURN]], label [[FOR_BODY_CLONE]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 458755, [[ENTRY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ], [ 0, [[FOR_BODY]] ], [ 0, [[FOR_BODY_CLONE]] ], [ 0, [[FOR_COND_PREHEADER_NEW2]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: