1234285Sdim//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===// 2234285Sdim// 3234285Sdim// The LLVM Compiler Infrastructure 4234285Sdim// 5234285Sdim// This file is distributed under the University of Illinois Open Source 6234285Sdim// License. See LICENSE.TXT for details. 7234285Sdim// 8234285Sdim//===----------------------------------------------------------------------===// 9234285Sdim// 10234285Sdim// This file implements some loop unrolling utilities for loops with run-time 11234285Sdim// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time 12234285Sdim// trip counts. 13234285Sdim// 14234285Sdim// The functions in this file are used to generate extra code when the 15234285Sdim// run-time trip count modulo the unroll factor is not 0. When this is the 16234285Sdim// case, we need to generate code to execute these 'left over' iterations. 17234285Sdim// 18234285Sdim// The current strategy generates an if-then-else sequence prior to the 19234285Sdim// unrolled loop to execute the 'left over' iterations. Other strategies 20234285Sdim// include generate a loop before or after the unrolled loop. 21234285Sdim// 22234285Sdim//===----------------------------------------------------------------------===// 23234285Sdim 24234285Sdim#define DEBUG_TYPE "loop-unroll" 25234285Sdim#include "llvm/Transforms/Utils/UnrollLoop.h" 26234285Sdim#include "llvm/ADT/Statistic.h" 27234285Sdim#include "llvm/Analysis/LoopIterator.h" 28234285Sdim#include "llvm/Analysis/LoopPass.h" 29234285Sdim#include "llvm/Analysis/ScalarEvolution.h" 30234285Sdim#include "llvm/Analysis/ScalarEvolutionExpander.h" 31249423Sdim#include "llvm/IR/BasicBlock.h" 32234285Sdim#include "llvm/Support/Debug.h" 33234285Sdim#include "llvm/Support/raw_ostream.h" 34234285Sdim#include "llvm/Transforms/Utils/BasicBlockUtils.h" 35234285Sdim#include "llvm/Transforms/Utils/Cloning.h" 36234285Sdim#include <algorithm> 37234285Sdim 38234285Sdimusing namespace llvm; 39234285Sdim 40234285SdimSTATISTIC(NumRuntimeUnrolled, 41234285Sdim "Number of loops unrolled with run-time trip counts"); 42234285Sdim 43234285Sdim/// Connect the unrolling prolog code to the original loop. 44234285Sdim/// The unrolling prolog code contains code to execute the 45234285Sdim/// 'extra' iterations if the run-time trip count modulo the 46234285Sdim/// unroll count is non-zero. 47234285Sdim/// 48234285Sdim/// This function performs the following: 49234285Sdim/// - Create PHI nodes at prolog end block to combine values 50234285Sdim/// that exit the prolog code and jump around the prolog. 51234285Sdim/// - Add a PHI operand to a PHI node at the loop exit block 52234285Sdim/// for values that exit the prolog and go around the loop. 53234285Sdim/// - Branch around the original loop if the trip count is less 54234285Sdim/// than the unroll factor. 55234285Sdim/// 56234285Sdimstatic void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, 57234285Sdim BasicBlock *LastPrologBB, BasicBlock *PrologEnd, 58234285Sdim BasicBlock *OrigPH, BasicBlock *NewPH, 59234285Sdim ValueToValueMapTy &LVMap, Pass *P) { 60234285Sdim BasicBlock *Latch = L->getLoopLatch(); 61234285Sdim assert(Latch != 0 && "Loop must have a latch"); 62234285Sdim 63234285Sdim // Create a PHI node for each outgoing value from the original loop 64234285Sdim // (which means it is an outgoing value from the prolog code too). 65234285Sdim // The new PHI node is inserted in the prolog end basic block. 66234285Sdim // The new PHI name is added as an operand of a PHI node in either 67234285Sdim // the loop header or the loop exit block. 68234285Sdim for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch); 69234285Sdim SBI != SBE; ++SBI) { 70234285Sdim for (BasicBlock::iterator BBI = (*SBI)->begin(); 71234285Sdim PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) { 72234285Sdim 73234285Sdim // Add a new PHI node to the prolog end block and add the 74234285Sdim // appropriate incoming values. 75234285Sdim PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr", 76234285Sdim PrologEnd->getTerminator()); 77234285Sdim // Adding a value to the new PHI node from the original loop preheader. 78234285Sdim // This is the value that skips all the prolog code. 79234285Sdim if (L->contains(PN)) { 80234285Sdim NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH); 81234285Sdim } else { 82234285Sdim NewPN->addIncoming(Constant::getNullValue(PN->getType()), OrigPH); 83234285Sdim } 84234285Sdim 85234285Sdim Value *V = PN->getIncomingValueForBlock(Latch); 86234285Sdim if (Instruction *I = dyn_cast<Instruction>(V)) { 87234285Sdim if (L->contains(I)) { 88234285Sdim V = LVMap[I]; 89234285Sdim } 90234285Sdim } 91234285Sdim // Adding a value to the new PHI node from the last prolog block 92234285Sdim // that was created. 93234285Sdim NewPN->addIncoming(V, LastPrologBB); 94234285Sdim 95234285Sdim // Update the existing PHI node operand with the value from the 96234285Sdim // new PHI node. How this is done depends on if the existing 97234285Sdim // PHI node is in the original loop block, or the exit block. 98234285Sdim if (L->contains(PN)) { 99234285Sdim PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN); 100234285Sdim } else { 101234285Sdim PN->addIncoming(NewPN, PrologEnd); 102234285Sdim } 103234285Sdim } 104234285Sdim } 105234285Sdim 106234285Sdim // Create a branch around the orignal loop, which is taken if the 107234285Sdim // trip count is less than the unroll factor. 108234285Sdim Instruction *InsertPt = PrologEnd->getTerminator(); 109234285Sdim Instruction *BrLoopExit = 110234285Sdim new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount, 111234285Sdim ConstantInt::get(TripCount->getType(), Count)); 112234285Sdim BasicBlock *Exit = L->getUniqueExitBlock(); 113234285Sdim assert(Exit != 0 && "Loop must have a single exit block only"); 114234285Sdim // Split the exit to maintain loop canonicalization guarantees 115234285Sdim SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); 116234285Sdim if (!Exit->isLandingPad()) { 117234285Sdim SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P); 118234285Sdim } else { 119234285Sdim SmallVector<BasicBlock*, 2> NewBBs; 120234285Sdim SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa", 121234285Sdim P, NewBBs); 122234285Sdim } 123234285Sdim // Add the branch to the exit block (around the unrolled loop) 124234285Sdim BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt); 125234285Sdim InsertPt->eraseFromParent(); 126234285Sdim} 127234285Sdim 128234285Sdim/// Create a clone of the blocks in a loop and connect them together. 129234285Sdim/// This function doesn't create a clone of the loop structure. 130234285Sdim/// 131234285Sdim/// There are two value maps that are defined and used. VMap is 132234285Sdim/// for the values in the current loop instance. LVMap contains 133234285Sdim/// the values from the last loop instance. We need the LVMap values 134239462Sdim/// to update the initial values for the current loop instance. 135234285Sdim/// 136234285Sdimstatic void CloneLoopBlocks(Loop *L, 137234285Sdim bool FirstCopy, 138234285Sdim BasicBlock *InsertTop, 139234285Sdim BasicBlock *InsertBot, 140234285Sdim std::vector<BasicBlock *> &NewBlocks, 141234285Sdim LoopBlocksDFS &LoopBlocks, 142234285Sdim ValueToValueMapTy &VMap, 143234285Sdim ValueToValueMapTy &LVMap, 144234285Sdim LoopInfo *LI) { 145234285Sdim 146234285Sdim BasicBlock *Preheader = L->getLoopPreheader(); 147234285Sdim BasicBlock *Header = L->getHeader(); 148234285Sdim BasicBlock *Latch = L->getLoopLatch(); 149234285Sdim Function *F = Header->getParent(); 150234285Sdim LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); 151234285Sdim LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); 152234285Sdim // For each block in the original loop, create a new copy, 153234285Sdim // and update the value map with the newly created values. 154234285Sdim for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { 155234285Sdim BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F); 156234285Sdim NewBlocks.push_back(NewBB); 157234285Sdim 158234285Sdim if (Loop *ParentLoop = L->getParentLoop()) 159234285Sdim ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); 160234285Sdim 161234285Sdim VMap[*BB] = NewBB; 162234285Sdim if (Header == *BB) { 163234285Sdim // For the first block, add a CFG connection to this newly 164234285Sdim // created block 165234285Sdim InsertTop->getTerminator()->setSuccessor(0, NewBB); 166234285Sdim 167234285Sdim // Change the incoming values to the ones defined in the 168234285Sdim // previously cloned loop. 169234285Sdim for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { 170234285Sdim PHINode *NewPHI = cast<PHINode>(VMap[I]); 171234285Sdim if (FirstCopy) { 172234285Sdim // We replace the first phi node with the value from the preheader 173234285Sdim VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); 174234285Sdim NewBB->getInstList().erase(NewPHI); 175234285Sdim } else { 176234285Sdim // Update VMap with values from the previous block 177234285Sdim unsigned idx = NewPHI->getBasicBlockIndex(Latch); 178234285Sdim Value *InVal = NewPHI->getIncomingValue(idx); 179234285Sdim if (Instruction *I = dyn_cast<Instruction>(InVal)) 180234285Sdim if (L->contains(I)) 181234285Sdim InVal = LVMap[InVal]; 182234285Sdim NewPHI->setIncomingValue(idx, InVal); 183234285Sdim NewPHI->setIncomingBlock(idx, InsertTop); 184234285Sdim } 185234285Sdim } 186234285Sdim } 187234285Sdim 188234285Sdim if (Latch == *BB) { 189234285Sdim VMap.erase((*BB)->getTerminator()); 190234285Sdim NewBB->getTerminator()->eraseFromParent(); 191234285Sdim BranchInst::Create(InsertBot, NewBB); 192234285Sdim } 193234285Sdim } 194234285Sdim // LastValueMap is updated with the values for the current loop 195234285Sdim // which are used the next time this function is called. 196234285Sdim for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); 197234285Sdim VI != VE; ++VI) { 198234285Sdim LVMap[VI->first] = VI->second; 199234285Sdim } 200234285Sdim} 201234285Sdim 202234285Sdim/// Insert code in the prolog code when unrolling a loop with a 203234285Sdim/// run-time trip-count. 204234285Sdim/// 205234285Sdim/// This method assumes that the loop unroll factor is total number 206234285Sdim/// of loop bodes in the loop after unrolling. (Some folks refer 207234285Sdim/// to the unroll factor as the number of *extra* copies added). 208234285Sdim/// We assume also that the loop unroll factor is a power-of-two. So, after 209234285Sdim/// unrolling the loop, the number of loop bodies executed is 2, 210234285Sdim/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch 211234285Sdim/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for 212234285Sdim/// the switch instruction is generated. 213234285Sdim/// 214234285Sdim/// extraiters = tripcount % loopfactor 215234285Sdim/// if (extraiters == 0) jump Loop: 216234285Sdim/// if (extraiters == loopfactor) jump L1 217234285Sdim/// if (extraiters == loopfactor-1) jump L2 218234285Sdim/// ... 219234285Sdim/// L1: LoopBody; 220234285Sdim/// L2: LoopBody; 221234285Sdim/// ... 222234285Sdim/// if tripcount < loopfactor jump End 223234285Sdim/// Loop: 224234285Sdim/// ... 225234285Sdim/// End: 226234285Sdim/// 227234285Sdimbool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, 228234285Sdim LPPassManager *LPM) { 229234285Sdim // for now, only unroll loops that contain a single exit 230234285Sdim if (!L->getExitingBlock()) 231234285Sdim return false; 232234285Sdim 233234285Sdim // Make sure the loop is in canonical form, and there is a single 234234285Sdim // exit block only. 235234285Sdim if (!L->isLoopSimplifyForm() || L->getUniqueExitBlock() == 0) 236234285Sdim return false; 237234285Sdim 238234285Sdim // Use Scalar Evolution to compute the trip count. This allows more 239234285Sdim // loops to be unrolled than relying on induction var simplification 240239462Sdim if (!LPM) 241239462Sdim return false; 242234285Sdim ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); 243234285Sdim if (SE == 0) 244234285Sdim return false; 245234285Sdim 246234285Sdim // Only unroll loops with a computable trip count and the trip count needs 247234285Sdim // to be an int value (allowing a pointer type is a TODO item) 248234285Sdim const SCEV *BECount = SE->getBackedgeTakenCount(L); 249234285Sdim if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) 250234285Sdim return false; 251234285Sdim 252234285Sdim // Add 1 since the backedge count doesn't include the first loop iteration 253234285Sdim const SCEV *TripCountSC = 254234285Sdim SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); 255234285Sdim if (isa<SCEVCouldNotCompute>(TripCountSC)) 256234285Sdim return false; 257234285Sdim 258234285Sdim // We only handle cases when the unroll factor is a power of 2. 259234285Sdim // Count is the loop unroll factor, the number of extra copies added + 1. 260234285Sdim if ((Count & (Count-1)) != 0) 261234285Sdim return false; 262234285Sdim 263234285Sdim // If this loop is nested, then the loop unroller changes the code in 264234285Sdim // parent loop, so the Scalar Evolution pass needs to be run again 265234285Sdim if (Loop *ParentLoop = L->getParentLoop()) 266234285Sdim SE->forgetLoop(ParentLoop); 267234285Sdim 268234285Sdim BasicBlock *PH = L->getLoopPreheader(); 269234285Sdim BasicBlock *Header = L->getHeader(); 270234285Sdim BasicBlock *Latch = L->getLoopLatch(); 271234285Sdim // It helps to splits the original preheader twice, one for the end of the 272234285Sdim // prolog code and one for a new loop preheader 273234285Sdim BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); 274234285Sdim BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); 275234285Sdim BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); 276234285Sdim 277234285Sdim // Compute the number of extra iterations required, which is: 278234285Sdim // extra iterations = run-time trip count % (loop unroll factor + 1) 279234285Sdim SCEVExpander Expander(*SE, "loop-unroll"); 280234285Sdim Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), 281234285Sdim PreHeaderBR); 282234285Sdim Type *CountTy = TripCount->getType(); 283234285Sdim BinaryOperator *ModVal = 284234285Sdim BinaryOperator::CreateURem(TripCount, 285234285Sdim ConstantInt::get(CountTy, Count), 286234285Sdim "xtraiter"); 287234285Sdim ModVal->insertBefore(PreHeaderBR); 288234285Sdim 289234285Sdim // Check if for no extra iterations, then jump to unrolled loop 290234285Sdim Value *BranchVal = new ICmpInst(PreHeaderBR, 291234285Sdim ICmpInst::ICMP_NE, ModVal, 292234285Sdim ConstantInt::get(CountTy, 0), "lcmp"); 293234285Sdim // Branch to either the extra iterations or the unrolled loop 294234285Sdim // We will fix up the true branch label when adding loop body copies 295234285Sdim BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); 296234285Sdim assert(PreHeaderBR->isUnconditional() && 297234285Sdim PreHeaderBR->getSuccessor(0) == PEnd && 298234285Sdim "CFG edges in Preheader are not correct"); 299234285Sdim PreHeaderBR->eraseFromParent(); 300234285Sdim 301234285Sdim ValueToValueMapTy LVMap; 302234285Sdim Function *F = Header->getParent(); 303234285Sdim // These variables are used to update the CFG links in each iteration 304234285Sdim BasicBlock *CompareBB = 0; 305234285Sdim BasicBlock *LastLoopBB = PH; 306234285Sdim // Get an ordered list of blocks in the loop to help with the ordering of the 307234285Sdim // cloned blocks in the prolog code 308234285Sdim LoopBlocksDFS LoopBlocks(L); 309234285Sdim LoopBlocks.perform(LI); 310234285Sdim 311234285Sdim // 312234285Sdim // For each extra loop iteration, create a copy of the loop's basic blocks 313234285Sdim // and generate a condition that branches to the copy depending on the 314234285Sdim // number of 'left over' iterations. 315234285Sdim // 316234285Sdim for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { 317234285Sdim std::vector<BasicBlock*> NewBlocks; 318234285Sdim ValueToValueMapTy VMap; 319234285Sdim 320234285Sdim // Clone all the basic blocks in the loop, but we don't clone the loop 321234285Sdim // This function adds the appropriate CFG connections. 322234285Sdim CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, 323234285Sdim LoopBlocks, VMap, LVMap, LI); 324234285Sdim LastLoopBB = cast<BasicBlock>(VMap[Latch]); 325234285Sdim 326234285Sdim // Insert the cloned blocks into function just before the original loop 327234285Sdim F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), 328234285Sdim NewBlocks[0], F->end()); 329234285Sdim 330234285Sdim // Generate the code for the comparison which determines if the loop 331234285Sdim // prolog code needs to be executed. 332234285Sdim if (leftOverIters == Count-1) { 333234285Sdim // There is no compare block for the fall-thru case when for the last 334234285Sdim // left over iteration 335234285Sdim CompareBB = NewBlocks[0]; 336234285Sdim } else { 337234285Sdim // Create a new block for the comparison 338234285Sdim BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", 339234285Sdim F, CompareBB); 340234285Sdim if (Loop *ParentLoop = L->getParentLoop()) { 341234285Sdim // Add the new block to the parent loop, if needed 342234285Sdim ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); 343234285Sdim } 344234285Sdim 345234285Sdim // The comparison w/ the extra iteration value and branch 346234285Sdim Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, 347234285Sdim ConstantInt::get(CountTy, leftOverIters), 348234285Sdim "un.tmp"); 349234285Sdim // Branch to either the extra iterations or the unrolled loop 350234285Sdim BranchInst::Create(NewBlocks[0], CompareBB, 351234285Sdim BranchVal, NewBB); 352234285Sdim CompareBB = NewBB; 353234285Sdim PH->getTerminator()->setSuccessor(0, NewBB); 354234285Sdim VMap[NewPH] = CompareBB; 355234285Sdim } 356234285Sdim 357234285Sdim // Rewrite the cloned instruction operands to use the values 358234285Sdim // created when the clone is created. 359234285Sdim for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { 360234285Sdim for (BasicBlock::iterator I = NewBlocks[i]->begin(), 361234285Sdim E = NewBlocks[i]->end(); I != E; ++I) { 362234285Sdim RemapInstruction(I, VMap, 363234285Sdim RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); 364234285Sdim } 365234285Sdim } 366234285Sdim } 367234285Sdim 368234285Sdim // Connect the prolog code to the original loop and update the 369234285Sdim // PHI functions. 370234285Sdim ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, 371234285Sdim LPM->getAsPass()); 372234285Sdim NumRuntimeUnrolled++; 373234285Sdim return true; 374234285Sdim} 375