1327952Sdim//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===// 2259698Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6259698Sdim// 7259698Sdim//===----------------------------------------------------------------------===// 8259698Sdim// 9259698Sdim// This pass implements a simple loop reroller. 10259698Sdim// 11259698Sdim//===----------------------------------------------------------------------===// 12259698Sdim 13327952Sdim#include "llvm/ADT/APInt.h" 14321369Sdim#include "llvm/ADT/BitVector.h" 15327952Sdim#include "llvm/ADT/DenseMap.h" 16327952Sdim#include "llvm/ADT/DenseSet.h" 17288943Sdim#include "llvm/ADT/MapVector.h" 18276479Sdim#include "llvm/ADT/STLExtras.h" 19341825Sdim#include "llvm/ADT/SmallPtrSet.h" 20327952Sdim#include "llvm/ADT/SmallVector.h" 21259698Sdim#include "llvm/ADT/Statistic.h" 22259698Sdim#include "llvm/Analysis/AliasAnalysis.h" 23259698Sdim#include "llvm/Analysis/AliasSetTracker.h" 24327952Sdim#include "llvm/Analysis/LoopInfo.h" 25259698Sdim#include "llvm/Analysis/LoopPass.h" 26259698Sdim#include "llvm/Analysis/ScalarEvolution.h" 27259698Sdim#include "llvm/Analysis/ScalarEvolutionExpander.h" 28259698Sdim#include "llvm/Analysis/ScalarEvolutionExpressions.h" 29288943Sdim#include "llvm/Analysis/TargetLibraryInfo.h" 30259698Sdim#include "llvm/Analysis/ValueTracking.h" 31327952Sdim#include "llvm/IR/BasicBlock.h" 32327952Sdim#include "llvm/IR/Constants.h" 33259698Sdim#include "llvm/IR/DataLayout.h" 34327952Sdim#include "llvm/IR/DerivedTypes.h" 35276479Sdim#include "llvm/IR/Dominators.h" 36327952Sdim#include "llvm/IR/IRBuilder.h" 37327952Sdim#include "llvm/IR/InstrTypes.h" 38327952Sdim#include "llvm/IR/Instruction.h" 39327952Sdim#include "llvm/IR/Instructions.h" 40259698Sdim#include "llvm/IR/IntrinsicInst.h" 41327952Sdim#include "llvm/IR/Intrinsics.h" 42327952Sdim#include "llvm/IR/Module.h" 43327952Sdim#include "llvm/IR/Type.h" 44327952Sdim#include "llvm/IR/Use.h" 45327952Sdim#include "llvm/IR/User.h" 46327952Sdim#include "llvm/IR/Value.h" 47360784Sdim#include "llvm/InitializePasses.h" 48327952Sdim#include "llvm/Pass.h" 49327952Sdim#include "llvm/Support/Casting.h" 50259698Sdim#include "llvm/Support/CommandLine.h" 51259698Sdim#include "llvm/Support/Debug.h" 52259698Sdim#include "llvm/Support/raw_ostream.h" 53321369Sdim#include "llvm/Transforms/Scalar.h" 54341825Sdim#include "llvm/Transforms/Utils.h" 55259698Sdim#include "llvm/Transforms/Utils/BasicBlockUtils.h" 56360784Sdim#include "llvm/Transforms/Utils/Local.h" 57259698Sdim#include "llvm/Transforms/Utils/LoopUtils.h" 58327952Sdim#include <cassert> 59327952Sdim#include <cstddef> 60327952Sdim#include <cstdint> 61327952Sdim#include <cstdlib> 62327952Sdim#include <iterator> 63327952Sdim#include <map> 64327952Sdim#include <utility> 65259698Sdim 66259698Sdimusing namespace llvm; 67259698Sdim 68276479Sdim#define DEBUG_TYPE "loop-reroll" 69276479Sdim 70259698SdimSTATISTIC(NumRerolledLoops, "Number of rerolled loops"); 71259698Sdim 72259698Sdimstatic cl::opt<unsigned> 73288943SdimNumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), 74288943Sdim cl::Hidden, 75288943Sdim cl::desc("The maximum number of failures to tolerate" 76288943Sdim " during fuzzy matching. (default: 400)")); 77288943Sdim 78259698Sdim// This loop re-rolling transformation aims to transform loops like this: 79259698Sdim// 80259698Sdim// int foo(int a); 81259698Sdim// void bar(int *x) { 82259698Sdim// for (int i = 0; i < 500; i += 3) { 83259698Sdim// foo(i); 84259698Sdim// foo(i+1); 85259698Sdim// foo(i+2); 86259698Sdim// } 87259698Sdim// } 88259698Sdim// 89259698Sdim// into a loop like this: 90259698Sdim// 91259698Sdim// void bar(int *x) { 92259698Sdim// for (int i = 0; i < 500; ++i) 93259698Sdim// foo(i); 94259698Sdim// } 95259698Sdim// 96259698Sdim// It does this by looking for loops that, besides the latch code, are composed 97259698Sdim// of isomorphic DAGs of instructions, with each DAG rooted at some increment 98259698Sdim// to the induction variable, and where each DAG is isomorphic to the DAG 99259698Sdim// rooted at the induction variable (excepting the sub-DAGs which root the 100259698Sdim// other induction-variable increments). In other words, we're looking for loop 101259698Sdim// bodies of the form: 102259698Sdim// 103259698Sdim// %iv = phi [ (preheader, ...), (body, %iv.next) ] 104259698Sdim// f(%iv) 105259698Sdim// %iv.1 = add %iv, 1 <-- a root increment 106259698Sdim// f(%iv.1) 107259698Sdim// %iv.2 = add %iv, 2 <-- a root increment 108259698Sdim// f(%iv.2) 109259698Sdim// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 110259698Sdim// f(%iv.scale_m_1) 111259698Sdim// ... 112259698Sdim// %iv.next = add %iv, scale 113259698Sdim// %cmp = icmp(%iv, ...) 114259698Sdim// br %cmp, header, exit 115259698Sdim// 116259698Sdim// where each f(i) is a set of instructions that, collectively, are a function 117259698Sdim// only of i (and other loop-invariant values). 118259698Sdim// 119259698Sdim// As a special case, we can also reroll loops like this: 120259698Sdim// 121259698Sdim// int foo(int); 122259698Sdim// void bar(int *x) { 123259698Sdim// for (int i = 0; i < 500; ++i) { 124259698Sdim// x[3*i] = foo(0); 125259698Sdim// x[3*i+1] = foo(0); 126259698Sdim// x[3*i+2] = foo(0); 127259698Sdim// } 128259698Sdim// } 129259698Sdim// 130259698Sdim// into this: 131259698Sdim// 132259698Sdim// void bar(int *x) { 133259698Sdim// for (int i = 0; i < 1500; ++i) 134259698Sdim// x[i] = foo(0); 135259698Sdim// } 136259698Sdim// 137259698Sdim// in which case, we're looking for inputs like this: 138259698Sdim// 139259698Sdim// %iv = phi [ (preheader, ...), (body, %iv.next) ] 140259698Sdim// %scaled.iv = mul %iv, scale 141259698Sdim// f(%scaled.iv) 142259698Sdim// %scaled.iv.1 = add %scaled.iv, 1 143259698Sdim// f(%scaled.iv.1) 144259698Sdim// %scaled.iv.2 = add %scaled.iv, 2 145259698Sdim// f(%scaled.iv.2) 146259698Sdim// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 147259698Sdim// f(%scaled.iv.scale_m_1) 148259698Sdim// ... 149259698Sdim// %iv.next = add %iv, 1 150259698Sdim// %cmp = icmp(%iv, ...) 151259698Sdim// br %cmp, header, exit 152259698Sdim 153259698Sdimnamespace { 154327952Sdim 155288943Sdim enum IterationLimits { 156309124Sdim /// The maximum number of iterations that we'll try and reroll. 157309124Sdim IL_MaxRerollIterations = 32, 158288943Sdim /// The bitvector index used by loop induction variables and other 159288943Sdim /// instructions that belong to all iterations. 160288943Sdim IL_All, 161288943Sdim IL_End 162288943Sdim }; 163288943Sdim 164259698Sdim class LoopReroll : public LoopPass { 165259698Sdim public: 166259698Sdim static char ID; // Pass ID, replacement for typeid 167327952Sdim 168259698Sdim LoopReroll() : LoopPass(ID) { 169259698Sdim initializeLoopRerollPass(*PassRegistry::getPassRegistry()); 170259698Sdim } 171259698Sdim 172276479Sdim bool runOnLoop(Loop *L, LPPassManager &LPM) override; 173259698Sdim 174276479Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 175288943Sdim AU.addRequired<TargetLibraryInfoWrapperPass>(); 176309124Sdim getLoopAnalysisUsage(AU); 177259698Sdim } 178259698Sdim 179288943Sdim protected: 180259698Sdim AliasAnalysis *AA; 181259698Sdim LoopInfo *LI; 182259698Sdim ScalarEvolution *SE; 183259698Sdim TargetLibraryInfo *TLI; 184259698Sdim DominatorTree *DT; 185296417Sdim bool PreserveLCSSA; 186259698Sdim 187327952Sdim using SmallInstructionVector = SmallVector<Instruction *, 16>; 188341825Sdim using SmallInstructionSet = SmallPtrSet<Instruction *, 16>; 189259698Sdim 190296417Sdim // Map between induction variable and its increment 191296417Sdim DenseMap<Instruction *, int64_t> IVToIncMap; 192327952Sdim 193309124Sdim // For loop with multiple induction variable, remember the one used only to 194309124Sdim // control the loop. 195309124Sdim Instruction *LoopControlIV; 196296417Sdim 197296417Sdim // A chain of isomorphic instructions, identified by a single-use PHI 198259698Sdim // representing a reduction. Only the last value may be used outside the 199259698Sdim // loop. 200259698Sdim struct SimpleLoopReduction { 201327952Sdim SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) { 202259698Sdim assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); 203259698Sdim add(L); 204259698Sdim } 205259698Sdim 206259698Sdim bool valid() const { 207259698Sdim return Valid; 208259698Sdim } 209259698Sdim 210259698Sdim Instruction *getPHI() const { 211259698Sdim assert(Valid && "Using invalid reduction"); 212259698Sdim return Instructions.front(); 213259698Sdim } 214259698Sdim 215259698Sdim Instruction *getReducedValue() const { 216259698Sdim assert(Valid && "Using invalid reduction"); 217259698Sdim return Instructions.back(); 218259698Sdim } 219259698Sdim 220259698Sdim Instruction *get(size_t i) const { 221259698Sdim assert(Valid && "Using invalid reduction"); 222259698Sdim return Instructions[i+1]; 223259698Sdim } 224259698Sdim 225259698Sdim Instruction *operator [] (size_t i) const { return get(i); } 226259698Sdim 227259698Sdim // The size, ignoring the initial PHI. 228259698Sdim size_t size() const { 229259698Sdim assert(Valid && "Using invalid reduction"); 230259698Sdim return Instructions.size()-1; 231259698Sdim } 232259698Sdim 233327952Sdim using iterator = SmallInstructionVector::iterator; 234327952Sdim using const_iterator = SmallInstructionVector::const_iterator; 235259698Sdim 236259698Sdim iterator begin() { 237259698Sdim assert(Valid && "Using invalid reduction"); 238276479Sdim return std::next(Instructions.begin()); 239259698Sdim } 240259698Sdim 241259698Sdim const_iterator begin() const { 242259698Sdim assert(Valid && "Using invalid reduction"); 243276479Sdim return std::next(Instructions.begin()); 244259698Sdim } 245259698Sdim 246259698Sdim iterator end() { return Instructions.end(); } 247259698Sdim const_iterator end() const { return Instructions.end(); } 248259698Sdim 249259698Sdim protected: 250327952Sdim bool Valid = false; 251259698Sdim SmallInstructionVector Instructions; 252259698Sdim 253259698Sdim void add(Loop *L); 254259698Sdim }; 255259698Sdim 256259698Sdim // The set of all reductions, and state tracking of possible reductions 257259698Sdim // during loop instruction processing. 258259698Sdim struct ReductionTracker { 259327952Sdim using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>; 260259698Sdim 261259698Sdim // Add a new possible reduction. 262280031Sdim void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } 263259698Sdim 264259698Sdim // Setup to track possible reductions corresponding to the provided 265259698Sdim // rerolling scale. Only reductions with a number of non-PHI instructions 266259698Sdim // that is divisible by the scale are considered. Three instructions sets 267259698Sdim // are filled in: 268259698Sdim // - A set of all possible instructions in eligible reductions. 269259698Sdim // - A set of all PHIs in eligible reductions 270280031Sdim // - A set of all reduced values (last instructions) in eligible 271280031Sdim // reductions. 272259698Sdim void restrictToScale(uint64_t Scale, 273259698Sdim SmallInstructionSet &PossibleRedSet, 274259698Sdim SmallInstructionSet &PossibleRedPHISet, 275259698Sdim SmallInstructionSet &PossibleRedLastSet) { 276259698Sdim PossibleRedIdx.clear(); 277259698Sdim PossibleRedIter.clear(); 278259698Sdim Reds.clear(); 279259698Sdim 280259698Sdim for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) 281259698Sdim if (PossibleReds[i].size() % Scale == 0) { 282259698Sdim PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); 283259698Sdim PossibleRedPHISet.insert(PossibleReds[i].getPHI()); 284280031Sdim 285259698Sdim PossibleRedSet.insert(PossibleReds[i].getPHI()); 286259698Sdim PossibleRedIdx[PossibleReds[i].getPHI()] = i; 287280031Sdim for (Instruction *J : PossibleReds[i]) { 288280031Sdim PossibleRedSet.insert(J); 289280031Sdim PossibleRedIdx[J] = i; 290259698Sdim } 291259698Sdim } 292259698Sdim } 293259698Sdim 294259698Sdim // The functions below are used while processing the loop instructions. 295259698Sdim 296259698Sdim // Are the two instructions both from reductions, and furthermore, from 297259698Sdim // the same reduction? 298259698Sdim bool isPairInSame(Instruction *J1, Instruction *J2) { 299259698Sdim DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); 300259698Sdim if (J1I != PossibleRedIdx.end()) { 301259698Sdim DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); 302259698Sdim if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) 303259698Sdim return true; 304259698Sdim } 305259698Sdim 306259698Sdim return false; 307259698Sdim } 308259698Sdim 309259698Sdim // The two provided instructions, the first from the base iteration, and 310259698Sdim // the second from iteration i, form a matched pair. If these are part of 311259698Sdim // a reduction, record that fact. 312259698Sdim void recordPair(Instruction *J1, Instruction *J2, unsigned i) { 313259698Sdim if (PossibleRedIdx.count(J1)) { 314259698Sdim assert(PossibleRedIdx.count(J2) && 315259698Sdim "Recording reduction vs. non-reduction instruction?"); 316259698Sdim 317259698Sdim PossibleRedIter[J1] = 0; 318259698Sdim PossibleRedIter[J2] = i; 319259698Sdim 320259698Sdim int Idx = PossibleRedIdx[J1]; 321259698Sdim assert(Idx == PossibleRedIdx[J2] && 322259698Sdim "Recording pair from different reductions?"); 323259698Sdim Reds.insert(Idx); 324259698Sdim } 325259698Sdim } 326259698Sdim 327259698Sdim // The functions below can be called after we've finished processing all 328259698Sdim // instructions in the loop, and we know which reductions were selected. 329259698Sdim 330259698Sdim bool validateSelected(); 331259698Sdim void replaceSelected(); 332259698Sdim 333259698Sdim protected: 334259698Sdim // The vector of all possible reductions (for any scale). 335259698Sdim SmallReductionVector PossibleReds; 336259698Sdim 337259698Sdim DenseMap<Instruction *, int> PossibleRedIdx; 338259698Sdim DenseMap<Instruction *, int> PossibleRedIter; 339259698Sdim DenseSet<int> Reds; 340259698Sdim }; 341259698Sdim 342288943Sdim // A DAGRootSet models an induction variable being used in a rerollable 343288943Sdim // loop. For example, 344288943Sdim // 345288943Sdim // x[i*3+0] = y1 346288943Sdim // x[i*3+1] = y2 347288943Sdim // x[i*3+2] = y3 348288943Sdim // 349296417Sdim // Base instruction -> i*3 350288943Sdim // +---+----+ 351288943Sdim // / | \ 352288943Sdim // ST[y1] +1 +2 <-- Roots 353288943Sdim // | | 354288943Sdim // ST[y2] ST[y3] 355288943Sdim // 356288943Sdim // There may be multiple DAGRoots, for example: 357288943Sdim // 358288943Sdim // x[i*2+0] = ... (1) 359288943Sdim // x[i*2+1] = ... (1) 360288943Sdim // x[i*2+4] = ... (2) 361288943Sdim // x[i*2+5] = ... (2) 362288943Sdim // x[(i+1234)*2+5678] = ... (3) 363288943Sdim // x[(i+1234)*2+5679] = ... (3) 364288943Sdim // 365288943Sdim // The loop will be rerolled by adding a new loop induction variable, 366288943Sdim // one for the Base instruction in each DAGRootSet. 367288943Sdim // 368288943Sdim struct DAGRootSet { 369288943Sdim Instruction *BaseInst; 370288943Sdim SmallInstructionVector Roots; 371327952Sdim 372288943Sdim // The instructions between IV and BaseInst (but not including BaseInst). 373288943Sdim SmallInstructionSet SubsumedInsts; 374288943Sdim }; 375288943Sdim 376288943Sdim // The set of all DAG roots, and state tracking of all roots 377288943Sdim // for a particular induction variable. 378288943Sdim struct DAGRootTracker { 379288943Sdim DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, 380288943Sdim ScalarEvolution *SE, AliasAnalysis *AA, 381296417Sdim TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, 382296417Sdim bool PreserveLCSSA, 383309124Sdim DenseMap<Instruction *, int64_t> &IncrMap, 384309124Sdim Instruction *LoopCtrlIV) 385296417Sdim : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), 386309124Sdim PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap), 387309124Sdim LoopControlIV(LoopCtrlIV) {} 388288943Sdim 389288943Sdim /// Stage 1: Find all the DAG roots for the induction variable. 390288943Sdim bool findRoots(); 391327952Sdim 392288943Sdim /// Stage 2: Validate if the found roots are valid. 393288943Sdim bool validate(ReductionTracker &Reductions); 394327952Sdim 395288943Sdim /// Stage 3: Assuming validate() returned true, perform the 396288943Sdim /// replacement. 397341825Sdim /// @param BackedgeTakenCount The backedge-taken count of L. 398341825Sdim void replace(const SCEV *BackedgeTakenCount); 399288943Sdim 400288943Sdim protected: 401327952Sdim using UsesTy = MapVector<Instruction *, BitVector>; 402288943Sdim 403314564Sdim void findRootsRecursive(Instruction *IVU, 404288943Sdim SmallInstructionSet SubsumedInsts); 405288943Sdim bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); 406288943Sdim bool collectPossibleRoots(Instruction *Base, 407288943Sdim std::map<int64_t,Instruction*> &Roots); 408314564Sdim bool validateRootSet(DAGRootSet &DRS); 409288943Sdim 410288943Sdim bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); 411288943Sdim void collectInLoopUserSet(const SmallInstructionVector &Roots, 412288943Sdim const SmallInstructionSet &Exclude, 413288943Sdim const SmallInstructionSet &Final, 414288943Sdim DenseSet<Instruction *> &Users); 415288943Sdim void collectInLoopUserSet(Instruction *Root, 416288943Sdim const SmallInstructionSet &Exclude, 417288943Sdim const SmallInstructionSet &Final, 418288943Sdim DenseSet<Instruction *> &Users); 419288943Sdim 420288943Sdim UsesTy::iterator nextInstr(int Val, UsesTy &In, 421288943Sdim const SmallInstructionSet &Exclude, 422288943Sdim UsesTy::iterator *StartI=nullptr); 423288943Sdim bool isBaseInst(Instruction *I); 424288943Sdim bool isRootInst(Instruction *I); 425288943Sdim bool instrDependsOn(Instruction *I, 426288943Sdim UsesTy::iterator Start, 427288943Sdim UsesTy::iterator End); 428341825Sdim void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr); 429288943Sdim 430288943Sdim LoopReroll *Parent; 431288943Sdim 432288943Sdim // Members of Parent, replicated here for brevity. 433288943Sdim Loop *L; 434288943Sdim ScalarEvolution *SE; 435288943Sdim AliasAnalysis *AA; 436288943Sdim TargetLibraryInfo *TLI; 437296417Sdim DominatorTree *DT; 438296417Sdim LoopInfo *LI; 439296417Sdim bool PreserveLCSSA; 440288943Sdim 441288943Sdim // The loop induction variable. 442288943Sdim Instruction *IV; 443327952Sdim 444288943Sdim // Loop step amount. 445296417Sdim int64_t Inc; 446327952Sdim 447288943Sdim // Loop reroll count; if Inc == 1, this records the scaling applied 448288943Sdim // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; 449288943Sdim // If Inc is not 1, Scale = Inc. 450288943Sdim uint64_t Scale; 451327952Sdim 452288943Sdim // The roots themselves. 453288943Sdim SmallVector<DAGRootSet,16> RootSets; 454327952Sdim 455288943Sdim // All increment instructions for IV. 456288943Sdim SmallInstructionVector LoopIncs; 457327952Sdim 458288943Sdim // Map of all instructions in the loop (in order) to the iterations 459288943Sdim // they are used in (or specially, IL_All for instructions 460288943Sdim // used in the loop increment mechanism). 461288943Sdim UsesTy Uses; 462327952Sdim 463296417Sdim // Map between induction variable and its increment 464296417Sdim DenseMap<Instruction *, int64_t> &IVToIncMap; 465327952Sdim 466309124Sdim Instruction *LoopControlIV; 467288943Sdim }; 468288943Sdim 469309124Sdim // Check if it is a compare-like instruction whose user is a branch 470309124Sdim bool isCompareUsedByBranch(Instruction *I) { 471309124Sdim auto *TI = I->getParent()->getTerminator(); 472309124Sdim if (!isa<BranchInst>(TI) || !isa<CmpInst>(I)) 473309124Sdim return false; 474309124Sdim return I->hasOneUse() && TI->getOperand(0) == I; 475309124Sdim }; 476309124Sdim 477309124Sdim bool isLoopControlIV(Loop *L, Instruction *IV); 478259698Sdim void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); 479259698Sdim void collectPossibleReductions(Loop *L, 480259698Sdim ReductionTracker &Reductions); 481341825Sdim bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, 482341825Sdim const SCEV *BackedgeTakenCount, ReductionTracker &Reductions); 483259698Sdim }; 484259698Sdim 485327952Sdim} // end anonymous namespace 486327952Sdim 487259698Sdimchar LoopReroll::ID = 0; 488327952Sdim 489259698SdimINITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) 490309124SdimINITIALIZE_PASS_DEPENDENCY(LoopPass) 491288943SdimINITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 492259698SdimINITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) 493259698Sdim 494259698SdimPass *llvm::createLoopRerollPass() { 495259698Sdim return new LoopReroll; 496259698Sdim} 497259698Sdim 498259698Sdim// Returns true if the provided instruction is used outside the given loop. 499259698Sdim// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in 500259698Sdim// non-loop blocks to be outside the loop. 501259698Sdimstatic bool hasUsesOutsideLoop(Instruction *I, Loop *L) { 502288943Sdim for (User *U : I->users()) { 503276479Sdim if (!L->contains(cast<Instruction>(U))) 504259698Sdim return true; 505288943Sdim } 506259698Sdim return false; 507259698Sdim} 508259698Sdim 509309124Sdim// Check if an IV is only used to control the loop. There are two cases: 510309124Sdim// 1. It only has one use which is loop increment, and the increment is only 511309124Sdim// used by comparison and the PHI (could has sext with nsw in between), and the 512309124Sdim// comparison is only used by branch. 513309124Sdim// 2. It is used by loop increment and the comparison, the loop increment is 514309124Sdim// only used by the PHI, and the comparison is used only by the branch. 515309124Sdimbool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { 516309124Sdim unsigned IVUses = IV->getNumUses(); 517309124Sdim if (IVUses != 2 && IVUses != 1) 518309124Sdim return false; 519309124Sdim 520309124Sdim for (auto *User : IV->users()) { 521309124Sdim int32_t IncOrCmpUses = User->getNumUses(); 522309124Sdim bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User)); 523309124Sdim 524309124Sdim // User can only have one or two uses. 525309124Sdim if (IncOrCmpUses != 2 && IncOrCmpUses != 1) 526309124Sdim return false; 527309124Sdim 528309124Sdim // Case 1 529309124Sdim if (IVUses == 1) { 530309124Sdim // The only user must be the loop increment. 531309124Sdim // The loop increment must have two uses. 532309124Sdim if (IsCompInst || IncOrCmpUses != 2) 533309124Sdim return false; 534309124Sdim } 535309124Sdim 536309124Sdim // Case 2 537309124Sdim if (IVUses == 2 && IncOrCmpUses != 1) 538309124Sdim return false; 539309124Sdim 540309124Sdim // The users of the IV must be a binary operation or a comparison 541309124Sdim if (auto *BO = dyn_cast<BinaryOperator>(User)) { 542309124Sdim if (BO->getOpcode() == Instruction::Add) { 543309124Sdim // Loop Increment 544309124Sdim // User of Loop Increment should be either PHI or CMP 545309124Sdim for (auto *UU : User->users()) { 546309124Sdim if (PHINode *PN = dyn_cast<PHINode>(UU)) { 547309124Sdim if (PN != IV) 548309124Sdim return false; 549309124Sdim } 550309124Sdim // Must be a CMP or an ext (of a value with nsw) then CMP 551309124Sdim else { 552309124Sdim Instruction *UUser = dyn_cast<Instruction>(UU); 553309124Sdim // Skip SExt if we are extending an nsw value 554309124Sdim // TODO: Allow ZExt too 555321369Sdim if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() && 556309124Sdim isa<SExtInst>(UUser)) 557309124Sdim UUser = dyn_cast<Instruction>(*(UUser->user_begin())); 558309124Sdim if (!isCompareUsedByBranch(UUser)) 559309124Sdim return false; 560309124Sdim } 561309124Sdim } 562309124Sdim } else 563309124Sdim return false; 564309124Sdim // Compare : can only have one use, and must be branch 565309124Sdim } else if (!IsCompInst) 566309124Sdim return false; 567309124Sdim } 568309124Sdim return true; 569309124Sdim} 570309124Sdim 571259698Sdim// Collect the list of loop induction variables with respect to which it might 572259698Sdim// be possible to reroll the loop. 573259698Sdimvoid LoopReroll::collectPossibleIVs(Loop *L, 574259698Sdim SmallInstructionVector &PossibleIVs) { 575259698Sdim BasicBlock *Header = L->getHeader(); 576259698Sdim for (BasicBlock::iterator I = Header->begin(), 577259698Sdim IE = Header->getFirstInsertionPt(); I != IE; ++I) { 578259698Sdim if (!isa<PHINode>(I)) 579259698Sdim continue; 580309124Sdim if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy()) 581259698Sdim continue; 582259698Sdim 583259698Sdim if (const SCEVAddRecExpr *PHISCEV = 584296417Sdim dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) { 585259698Sdim if (PHISCEV->getLoop() != L) 586259698Sdim continue; 587259698Sdim if (!PHISCEV->isAffine()) 588259698Sdim continue; 589341825Sdim auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE)); 590309124Sdim if (IncSCEV) { 591296417Sdim IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); 592341825Sdim LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV 593341825Sdim << "\n"); 594309124Sdim 595309124Sdim if (isLoopControlIV(L, &*I)) { 596309124Sdim assert(!LoopControlIV && "Found two loop control only IV"); 597309124Sdim LoopControlIV = &(*I); 598341825Sdim LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I 599341825Sdim << " = " << *PHISCEV << "\n"); 600309124Sdim } else 601309124Sdim PossibleIVs.push_back(&*I); 602259698Sdim } 603259698Sdim } 604259698Sdim } 605259698Sdim} 606259698Sdim 607259698Sdim// Add the remainder of the reduction-variable chain to the instruction vector 608259698Sdim// (the initial PHINode has already been added). If successful, the object is 609259698Sdim// marked as valid. 610259698Sdimvoid LoopReroll::SimpleLoopReduction::add(Loop *L) { 611259698Sdim assert(!Valid && "Cannot add to an already-valid chain"); 612259698Sdim 613259698Sdim // The reduction variable must be a chain of single-use instructions 614259698Sdim // (including the PHI), except for the last value (which is used by the PHI 615259698Sdim // and also outside the loop). 616259698Sdim Instruction *C = Instructions.front(); 617288943Sdim if (C->user_empty()) 618288943Sdim return; 619259698Sdim 620259698Sdim do { 621276479Sdim C = cast<Instruction>(*C->user_begin()); 622259698Sdim if (C->hasOneUse()) { 623259698Sdim if (!C->isBinaryOp()) 624259698Sdim return; 625259698Sdim 626259698Sdim if (!(isa<PHINode>(Instructions.back()) || 627259698Sdim C->isSameOperationAs(Instructions.back()))) 628259698Sdim return; 629259698Sdim 630259698Sdim Instructions.push_back(C); 631259698Sdim } 632259698Sdim } while (C->hasOneUse()); 633259698Sdim 634259698Sdim if (Instructions.size() < 2 || 635259698Sdim !C->isSameOperationAs(Instructions.back()) || 636276479Sdim C->use_empty()) 637259698Sdim return; 638259698Sdim 639259698Sdim // C is now the (potential) last instruction in the reduction chain. 640288943Sdim for (User *U : C->users()) { 641259698Sdim // The only in-loop user can be the initial PHI. 642276479Sdim if (L->contains(cast<Instruction>(U))) 643276479Sdim if (cast<Instruction>(U) != Instructions.front()) 644259698Sdim return; 645288943Sdim } 646259698Sdim 647259698Sdim Instructions.push_back(C); 648259698Sdim Valid = true; 649259698Sdim} 650259698Sdim 651259698Sdim// Collect the vector of possible reduction variables. 652259698Sdimvoid LoopReroll::collectPossibleReductions(Loop *L, 653259698Sdim ReductionTracker &Reductions) { 654259698Sdim BasicBlock *Header = L->getHeader(); 655259698Sdim for (BasicBlock::iterator I = Header->begin(), 656259698Sdim IE = Header->getFirstInsertionPt(); I != IE; ++I) { 657259698Sdim if (!isa<PHINode>(I)) 658259698Sdim continue; 659259698Sdim if (!I->getType()->isSingleValueType()) 660259698Sdim continue; 661259698Sdim 662296417Sdim SimpleLoopReduction SLR(&*I, L); 663259698Sdim if (!SLR.valid()) 664259698Sdim continue; 665259698Sdim 666341825Sdim LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " 667341825Sdim << SLR.size() << " chained instructions)\n"); 668259698Sdim Reductions.addSLR(SLR); 669259698Sdim } 670259698Sdim} 671259698Sdim 672259698Sdim// Collect the set of all users of the provided root instruction. This set of 673259698Sdim// users contains not only the direct users of the root instruction, but also 674259698Sdim// all users of those users, and so on. There are two exceptions: 675259698Sdim// 676259698Sdim// 1. Instructions in the set of excluded instructions are never added to the 677259698Sdim// use set (even if they are users). This is used, for example, to exclude 678259698Sdim// including root increments in the use set of the primary IV. 679259698Sdim// 680259698Sdim// 2. Instructions in the set of final instructions are added to the use set 681259698Sdim// if they are users, but their users are not added. This is used, for 682259698Sdim// example, to prevent a reduction update from forcing all later reduction 683259698Sdim// updates into the use set. 684288943Sdimvoid LoopReroll::DAGRootTracker::collectInLoopUserSet( 685259698Sdim Instruction *Root, const SmallInstructionSet &Exclude, 686259698Sdim const SmallInstructionSet &Final, 687259698Sdim DenseSet<Instruction *> &Users) { 688259698Sdim SmallInstructionVector Queue(1, Root); 689259698Sdim while (!Queue.empty()) { 690259698Sdim Instruction *I = Queue.pop_back_val(); 691259698Sdim if (!Users.insert(I).second) 692259698Sdim continue; 693259698Sdim 694259698Sdim if (!Final.count(I)) 695276479Sdim for (Use &U : I->uses()) { 696276479Sdim Instruction *User = cast<Instruction>(U.getUser()); 697259698Sdim if (PHINode *PN = dyn_cast<PHINode>(User)) { 698259698Sdim // Ignore "wrap-around" uses to PHIs of this loop's header. 699276479Sdim if (PN->getIncomingBlock(U) == L->getHeader()) 700259698Sdim continue; 701259698Sdim } 702280031Sdim 703259698Sdim if (L->contains(User) && !Exclude.count(User)) { 704259698Sdim Queue.push_back(User); 705259698Sdim } 706259698Sdim } 707259698Sdim 708259698Sdim // We also want to collect single-user "feeder" values. 709259698Sdim for (User::op_iterator OI = I->op_begin(), 710259698Sdim OIE = I->op_end(); OI != OIE; ++OI) { 711259698Sdim if (Instruction *Op = dyn_cast<Instruction>(*OI)) 712259698Sdim if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && 713259698Sdim !Final.count(Op)) 714259698Sdim Queue.push_back(Op); 715259698Sdim } 716259698Sdim } 717259698Sdim} 718259698Sdim 719259698Sdim// Collect all of the users of all of the provided root instructions (combined 720259698Sdim// into a single set). 721288943Sdimvoid LoopReroll::DAGRootTracker::collectInLoopUserSet( 722259698Sdim const SmallInstructionVector &Roots, 723259698Sdim const SmallInstructionSet &Exclude, 724259698Sdim const SmallInstructionSet &Final, 725259698Sdim DenseSet<Instruction *> &Users) { 726309124Sdim for (Instruction *Root : Roots) 727309124Sdim collectInLoopUserSet(Root, Exclude, Final, Users); 728259698Sdim} 729259698Sdim 730314564Sdimstatic bool isUnorderedLoadStore(Instruction *I) { 731259698Sdim if (LoadInst *LI = dyn_cast<LoadInst>(I)) 732314564Sdim return LI->isUnordered(); 733259698Sdim if (StoreInst *SI = dyn_cast<StoreInst>(I)) 734314564Sdim return SI->isUnordered(); 735259698Sdim if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 736259698Sdim return !MI->isVolatile(); 737259698Sdim return false; 738259698Sdim} 739259698Sdim 740288943Sdim/// Return true if IVU is a "simple" arithmetic operation. 741288943Sdim/// This is used for narrowing the search space for DAGRoots; only arithmetic 742288943Sdim/// and GEPs can be part of a DAGRoot. 743288943Sdimstatic bool isSimpleArithmeticOp(User *IVU) { 744288943Sdim if (Instruction *I = dyn_cast<Instruction>(IVU)) { 745288943Sdim switch (I->getOpcode()) { 746288943Sdim default: return false; 747288943Sdim case Instruction::Add: 748288943Sdim case Instruction::Sub: 749288943Sdim case Instruction::Mul: 750288943Sdim case Instruction::Shl: 751288943Sdim case Instruction::AShr: 752288943Sdim case Instruction::LShr: 753288943Sdim case Instruction::GetElementPtr: 754288943Sdim case Instruction::Trunc: 755288943Sdim case Instruction::ZExt: 756288943Sdim case Instruction::SExt: 757288943Sdim return true; 758288943Sdim } 759288943Sdim } 760288943Sdim return false; 761288943Sdim} 762288943Sdim 763288943Sdimstatic bool isLoopIncrement(User *U, Instruction *IV) { 764288943Sdim BinaryOperator *BO = dyn_cast<BinaryOperator>(U); 765309124Sdim 766309124Sdim if ((BO && BO->getOpcode() != Instruction::Add) || 767309124Sdim (!BO && !isa<GetElementPtrInst>(U))) 768259698Sdim return false; 769259698Sdim 770309124Sdim for (auto *UU : U->users()) { 771288943Sdim PHINode *PN = dyn_cast<PHINode>(UU); 772288943Sdim if (PN && PN == IV) 773288943Sdim return true; 774259698Sdim } 775288943Sdim return false; 776288943Sdim} 777259698Sdim 778288943Sdimbool LoopReroll::DAGRootTracker:: 779288943SdimcollectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { 780288943Sdim SmallInstructionVector BaseUsers; 781288943Sdim 782288943Sdim for (auto *I : Base->users()) { 783288943Sdim ConstantInt *CI = nullptr; 784288943Sdim 785288943Sdim if (isLoopIncrement(I, IV)) { 786288943Sdim LoopIncs.push_back(cast<Instruction>(I)); 787288943Sdim continue; 788288943Sdim } 789288943Sdim 790288943Sdim // The root nodes must be either GEPs, ORs or ADDs. 791288943Sdim if (auto *BO = dyn_cast<BinaryOperator>(I)) { 792288943Sdim if (BO->getOpcode() == Instruction::Add || 793288943Sdim BO->getOpcode() == Instruction::Or) 794288943Sdim CI = dyn_cast<ConstantInt>(BO->getOperand(1)); 795288943Sdim } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { 796288943Sdim Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); 797288943Sdim CI = dyn_cast<ConstantInt>(LastOperand); 798288943Sdim } 799288943Sdim 800288943Sdim if (!CI) { 801288943Sdim if (Instruction *II = dyn_cast<Instruction>(I)) { 802288943Sdim BaseUsers.push_back(II); 803288943Sdim continue; 804288943Sdim } else { 805341825Sdim LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I 806341825Sdim << "\n"); 807288943Sdim return false; 808288943Sdim } 809288943Sdim } 810288943Sdim 811296417Sdim int64_t V = std::abs(CI->getValue().getSExtValue()); 812288943Sdim if (Roots.find(V) != Roots.end()) 813288943Sdim // No duplicates, please. 814288943Sdim return false; 815288943Sdim 816288943Sdim Roots[V] = cast<Instruction>(I); 817288943Sdim } 818288943Sdim 819314564Sdim // Make sure we have at least two roots. 820314564Sdim if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty())) 821259698Sdim return false; 822259698Sdim 823288943Sdim // If we found non-loop-inc, non-root users of Base, assume they are 824288943Sdim // for the zeroth root index. This is because "add %a, 0" gets optimized 825288943Sdim // away. 826288943Sdim if (BaseUsers.size()) { 827288943Sdim if (Roots.find(0) != Roots.end()) { 828341825Sdim LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); 829259698Sdim return false; 830288943Sdim } 831288943Sdim Roots[0] = Base; 832288943Sdim } 833288943Sdim 834288943Sdim // Calculate the number of users of the base, or lowest indexed, iteration. 835288943Sdim unsigned NumBaseUses = BaseUsers.size(); 836288943Sdim if (NumBaseUses == 0) 837288943Sdim NumBaseUses = Roots.begin()->second->getNumUses(); 838296417Sdim 839288943Sdim // Check that every node has the same number of users. 840288943Sdim for (auto &KV : Roots) { 841288943Sdim if (KV.first == 0) 842288943Sdim continue; 843321369Sdim if (!KV.second->hasNUses(NumBaseUses)) { 844341825Sdim LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " 845341825Sdim << "#Base=" << NumBaseUses 846341825Sdim << ", #Root=" << KV.second->getNumUses() << "\n"); 847259698Sdim return false; 848288943Sdim } 849288943Sdim } 850259698Sdim 851296417Sdim return true; 852288943Sdim} 853288943Sdim 854314564Sdimvoid LoopReroll::DAGRootTracker:: 855288943SdimfindRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { 856288943Sdim // Does the user look like it could be part of a root set? 857288943Sdim // All its users must be simple arithmetic ops. 858321369Sdim if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1)) 859314564Sdim return; 860288943Sdim 861314564Sdim if (I != IV && findRootsBase(I, SubsumedInsts)) 862314564Sdim return; 863288943Sdim 864288943Sdim SubsumedInsts.insert(I); 865288943Sdim 866288943Sdim for (User *V : I->users()) { 867314564Sdim Instruction *I = cast<Instruction>(V); 868314564Sdim if (is_contained(LoopIncs, I)) 869288943Sdim continue; 870288943Sdim 871314564Sdim if (!isSimpleArithmeticOp(I)) 872314564Sdim continue; 873314564Sdim 874314564Sdim // The recursive call makes a copy of SubsumedInsts. 875314564Sdim findRootsRecursive(I, SubsumedInsts); 876288943Sdim } 877314564Sdim} 878314564Sdim 879314564Sdimbool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) { 880314564Sdim if (DRS.Roots.empty()) 881314564Sdim return false; 882314564Sdim 883314564Sdim // Consider a DAGRootSet with N-1 roots (so N different values including 884314564Sdim // BaseInst). 885314564Sdim // Define d = Roots[0] - BaseInst, which should be the same as 886314564Sdim // Roots[I] - Roots[I-1] for all I in [1..N). 887314564Sdim // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the 888314564Sdim // loop iteration J. 889314564Sdim // 890314564Sdim // Now, For the loop iterations to be consecutive: 891314564Sdim // D = d * N 892314564Sdim const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); 893314564Sdim if (!ADR) 894314564Sdim return false; 895353358Sdim 896353358Sdim // Check that the first root is evenly spaced. 897314564Sdim unsigned N = DRS.Roots.size() + 1; 898314564Sdim const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR); 899314564Sdim const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N); 900314564Sdim if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) 901314564Sdim return false; 902314564Sdim 903353358Sdim // Check that the remainling roots are evenly spaced. 904353358Sdim for (unsigned i = 1; i < N - 1; ++i) { 905353358Sdim const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]), 906353358Sdim SE->getSCEV(DRS.Roots[i-1])); 907353358Sdim if (NewStepSCEV != StepSCEV) 908353358Sdim return false; 909353358Sdim } 910353358Sdim 911288943Sdim return true; 912288943Sdim} 913288943Sdim 914288943Sdimbool LoopReroll::DAGRootTracker:: 915288943SdimfindRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { 916314564Sdim // The base of a RootSet must be an AddRec, so it can be erased. 917314564Sdim const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU)); 918314564Sdim if (!IVU_ADR || IVU_ADR->getLoop() != L) 919259698Sdim return false; 920259698Sdim 921288943Sdim std::map<int64_t, Instruction*> V; 922288943Sdim if (!collectPossibleRoots(IVU, V)) 923288943Sdim return false; 924288943Sdim 925296417Sdim // If we didn't get a root for index zero, then IVU must be 926288943Sdim // subsumed. 927288943Sdim if (V.find(0) == V.end()) 928288943Sdim SubsumedInsts.insert(IVU); 929288943Sdim 930288943Sdim // Partition the vector into monotonically increasing indexes. 931288943Sdim DAGRootSet DRS; 932288943Sdim DRS.BaseInst = nullptr; 933288943Sdim 934314564Sdim SmallVector<DAGRootSet, 16> PotentialRootSets; 935314564Sdim 936288943Sdim for (auto &KV : V) { 937288943Sdim if (!DRS.BaseInst) { 938288943Sdim DRS.BaseInst = KV.second; 939288943Sdim DRS.SubsumedInsts = SubsumedInsts; 940288943Sdim } else if (DRS.Roots.empty()) { 941288943Sdim DRS.Roots.push_back(KV.second); 942288943Sdim } else if (V.find(KV.first - 1) != V.end()) { 943288943Sdim DRS.Roots.push_back(KV.second); 944288943Sdim } else { 945288943Sdim // Linear sequence terminated. 946314564Sdim if (!validateRootSet(DRS)) 947314564Sdim return false; 948314564Sdim 949314564Sdim // Construct a new DAGRootSet with the next sequence. 950314564Sdim PotentialRootSets.push_back(DRS); 951288943Sdim DRS.BaseInst = KV.second; 952288943Sdim DRS.Roots.clear(); 953288943Sdim } 954288943Sdim } 955288943Sdim 956314564Sdim if (!validateRootSet(DRS)) 957314564Sdim return false; 958314564Sdim 959314564Sdim PotentialRootSets.push_back(DRS); 960314564Sdim 961314564Sdim RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end()); 962314564Sdim 963259698Sdim return true; 964259698Sdim} 965259698Sdim 966288943Sdimbool LoopReroll::DAGRootTracker::findRoots() { 967296417Sdim Inc = IVToIncMap[IV]; 968259698Sdim 969288943Sdim assert(RootSets.empty() && "Unclean state!"); 970296417Sdim if (std::abs(Inc) == 1) { 971288943Sdim for (auto *IVU : IV->users()) { 972288943Sdim if (isLoopIncrement(IVU, IV)) 973288943Sdim LoopIncs.push_back(cast<Instruction>(IVU)); 974259698Sdim } 975314564Sdim findRootsRecursive(IV, SmallInstructionSet()); 976288943Sdim LoopIncs.push_back(IV); 977288943Sdim } else { 978288943Sdim if (!findRootsBase(IV, SmallInstructionSet())) 979288943Sdim return false; 980259698Sdim } 981259698Sdim 982288943Sdim // Ensure all sets have the same size. 983288943Sdim if (RootSets.empty()) { 984341825Sdim LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); 985259698Sdim return false; 986288943Sdim } 987288943Sdim for (auto &V : RootSets) { 988288943Sdim if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { 989341825Sdim LLVM_DEBUG( 990341825Sdim dbgs() 991341825Sdim << "LRR: Aborting because not all root sets have the same size\n"); 992288943Sdim return false; 993259698Sdim } 994288943Sdim } 995259698Sdim 996288943Sdim Scale = RootSets[0].Roots.size() + 1; 997288943Sdim 998288943Sdim if (Scale > IL_MaxRerollIterations) { 999341825Sdim LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " 1000341825Sdim << "#Found=" << Scale 1001341825Sdim << ", #Max=" << IL_MaxRerollIterations << "\n"); 1002259698Sdim return false; 1003288943Sdim } 1004259698Sdim 1005341825Sdim LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale 1006341825Sdim << "\n"); 1007288943Sdim 1008259698Sdim return true; 1009259698Sdim} 1010259698Sdim 1011288943Sdimbool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { 1012288943Sdim // Populate the MapVector with all instructions in the block, in order first, 1013288943Sdim // so we can iterate over the contents later in perfect order. 1014288943Sdim for (auto &I : *L->getHeader()) { 1015288943Sdim Uses[&I].resize(IL_End); 1016288943Sdim } 1017288943Sdim 1018288943Sdim SmallInstructionSet Exclude; 1019288943Sdim for (auto &DRS : RootSets) { 1020288943Sdim Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); 1021288943Sdim Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); 1022288943Sdim Exclude.insert(DRS.BaseInst); 1023288943Sdim } 1024288943Sdim Exclude.insert(LoopIncs.begin(), LoopIncs.end()); 1025288943Sdim 1026288943Sdim for (auto &DRS : RootSets) { 1027288943Sdim DenseSet<Instruction*> VBase; 1028288943Sdim collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); 1029288943Sdim for (auto *I : VBase) { 1030288943Sdim Uses[I].set(0); 1031288943Sdim } 1032288943Sdim 1033288943Sdim unsigned Idx = 1; 1034288943Sdim for (auto *Root : DRS.Roots) { 1035288943Sdim DenseSet<Instruction*> V; 1036288943Sdim collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); 1037288943Sdim 1038288943Sdim // While we're here, check the use sets are the same size. 1039288943Sdim if (V.size() != VBase.size()) { 1040341825Sdim LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); 1041259698Sdim return false; 1042259698Sdim } 1043259698Sdim 1044288943Sdim for (auto *I : V) { 1045288943Sdim Uses[I].set(Idx); 1046259698Sdim } 1047288943Sdim ++Idx; 1048288943Sdim } 1049259698Sdim 1050288943Sdim // Make sure our subsumed instructions are remembered too. 1051288943Sdim for (auto *I : DRS.SubsumedInsts) { 1052288943Sdim Uses[I].set(IL_All); 1053259698Sdim } 1054259698Sdim } 1055259698Sdim 1056288943Sdim // Make sure the loop increments are also accounted for. 1057259698Sdim 1058288943Sdim Exclude.clear(); 1059288943Sdim for (auto &DRS : RootSets) { 1060288943Sdim Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); 1061288943Sdim Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); 1062288943Sdim Exclude.insert(DRS.BaseInst); 1063288943Sdim } 1064259698Sdim 1065288943Sdim DenseSet<Instruction*> V; 1066288943Sdim collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); 1067288943Sdim for (auto *I : V) { 1068288943Sdim Uses[I].set(IL_All); 1069259698Sdim } 1070259698Sdim 1071288943Sdim return true; 1072288943Sdim} 1073259698Sdim 1074288943Sdim/// Get the next instruction in "In" that is a member of set Val. 1075288943Sdim/// Start searching from StartI, and do not return anything in Exclude. 1076288943Sdim/// If StartI is not given, start from In.begin(). 1077288943SdimLoopReroll::DAGRootTracker::UsesTy::iterator 1078288943SdimLoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, 1079288943Sdim const SmallInstructionSet &Exclude, 1080288943Sdim UsesTy::iterator *StartI) { 1081288943Sdim UsesTy::iterator I = StartI ? *StartI : In.begin(); 1082288943Sdim while (I != In.end() && (I->second.test(Val) == 0 || 1083288943Sdim Exclude.count(I->first) != 0)) 1084288943Sdim ++I; 1085288943Sdim return I; 1086288943Sdim} 1087259698Sdim 1088288943Sdimbool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { 1089288943Sdim for (auto &DRS : RootSets) { 1090288943Sdim if (DRS.BaseInst == I) 1091288943Sdim return true; 1092288943Sdim } 1093288943Sdim return false; 1094288943Sdim} 1095259698Sdim 1096288943Sdimbool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { 1097288943Sdim for (auto &DRS : RootSets) { 1098314564Sdim if (is_contained(DRS.Roots, I)) 1099288943Sdim return true; 1100288943Sdim } 1101288943Sdim return false; 1102288943Sdim} 1103259698Sdim 1104288943Sdim/// Return true if instruction I depends on any instruction between 1105288943Sdim/// Start and End. 1106288943Sdimbool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, 1107288943Sdim UsesTy::iterator Start, 1108288943Sdim UsesTy::iterator End) { 1109288943Sdim for (auto *U : I->users()) { 1110288943Sdim for (auto It = Start; It != End; ++It) 1111288943Sdim if (U == It->first) 1112288943Sdim return true; 1113288943Sdim } 1114288943Sdim return false; 1115288943Sdim} 1116259698Sdim 1117296417Sdimstatic bool isIgnorableInst(const Instruction *I) { 1118296417Sdim if (isa<DbgInfoIntrinsic>(I)) 1119296417Sdim return true; 1120296417Sdim const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); 1121296417Sdim if (!II) 1122296417Sdim return false; 1123296417Sdim switch (II->getIntrinsicID()) { 1124296417Sdim default: 1125296417Sdim return false; 1126327952Sdim case Intrinsic::annotation: 1127296417Sdim case Intrinsic::ptr_annotation: 1128296417Sdim case Intrinsic::var_annotation: 1129296417Sdim // TODO: the following intrinsics may also be whitelisted: 1130296417Sdim // lifetime_start, lifetime_end, invariant_start, invariant_end 1131296417Sdim return true; 1132296417Sdim } 1133296417Sdim return false; 1134296417Sdim} 1135296417Sdim 1136288943Sdimbool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { 1137259698Sdim // We now need to check for equivalence of the use graph of each root with 1138259698Sdim // that of the primary induction variable (excluding the roots). Our goal 1139259698Sdim // here is not to solve the full graph isomorphism problem, but rather to 1140259698Sdim // catch common cases without a lot of work. As a result, we will assume 1141259698Sdim // that the relative order of the instructions in each unrolled iteration 1142259698Sdim // is the same (although we will not make an assumption about how the 1143259698Sdim // different iterations are intermixed). Note that while the order must be 1144259698Sdim // the same, the instructions may not be in the same basic block. 1145259698Sdim 1146288943Sdim // An array of just the possible reductions for this scale factor. When we 1147288943Sdim // collect the set of all users of some root instructions, these reduction 1148288943Sdim // instructions are treated as 'final' (their uses are not considered). 1149288943Sdim // This is important because we don't want the root use set to search down 1150288943Sdim // the reduction chain. 1151288943Sdim SmallInstructionSet PossibleRedSet; 1152288943Sdim SmallInstructionSet PossibleRedLastSet; 1153288943Sdim SmallInstructionSet PossibleRedPHISet; 1154288943Sdim Reductions.restrictToScale(Scale, PossibleRedSet, 1155288943Sdim PossibleRedPHISet, PossibleRedLastSet); 1156259698Sdim 1157288943Sdim // Populate "Uses" with where each instruction is used. 1158288943Sdim if (!collectUsedInstructions(PossibleRedSet)) 1159288943Sdim return false; 1160259698Sdim 1161288943Sdim // Make sure we mark the reduction PHIs as used in all iterations. 1162288943Sdim for (auto *I : PossibleRedPHISet) { 1163288943Sdim Uses[I].set(IL_All); 1164288943Sdim } 1165259698Sdim 1166309124Sdim // Make sure we mark loop-control-only PHIs as used in all iterations. See 1167309124Sdim // comment above LoopReroll::isLoopControlIV for more information. 1168309124Sdim BasicBlock *Header = L->getHeader(); 1169309124Sdim if (LoopControlIV && LoopControlIV != IV) { 1170309124Sdim for (auto *U : LoopControlIV->users()) { 1171309124Sdim Instruction *IVUser = dyn_cast<Instruction>(U); 1172309124Sdim // IVUser could be loop increment or compare 1173309124Sdim Uses[IVUser].set(IL_All); 1174309124Sdim for (auto *UU : IVUser->users()) { 1175309124Sdim Instruction *UUser = dyn_cast<Instruction>(UU); 1176309124Sdim // UUser could be compare, PHI or branch 1177309124Sdim Uses[UUser].set(IL_All); 1178309124Sdim // Skip SExt 1179309124Sdim if (isa<SExtInst>(UUser)) { 1180309124Sdim UUser = dyn_cast<Instruction>(*(UUser->user_begin())); 1181309124Sdim Uses[UUser].set(IL_All); 1182309124Sdim } 1183309124Sdim // Is UUser a compare instruction? 1184309124Sdim if (UU->hasOneUse()) { 1185309124Sdim Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin()); 1186309124Sdim if (BI == cast<BranchInst>(Header->getTerminator())) 1187309124Sdim Uses[BI].set(IL_All); 1188309124Sdim } 1189309124Sdim } 1190309124Sdim } 1191309124Sdim } 1192309124Sdim 1193288943Sdim // Make sure all instructions in the loop are in one and only one 1194288943Sdim // set. 1195288943Sdim for (auto &KV : Uses) { 1196296417Sdim if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { 1197341825Sdim LLVM_DEBUG( 1198341825Sdim dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " 1199341825Sdim << *KV.first << " (#uses=" << KV.second.count() << ")\n"); 1200288943Sdim return false; 1201288943Sdim } 1202288943Sdim } 1203259698Sdim 1204341825Sdim LLVM_DEBUG(for (auto &KV 1205341825Sdim : Uses) { 1206341825Sdim dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; 1207341825Sdim }); 1208259698Sdim 1209288943Sdim for (unsigned Iter = 1; Iter < Scale; ++Iter) { 1210259698Sdim // In addition to regular aliasing information, we need to look for 1211259698Sdim // instructions from later (future) iterations that have side effects 1212259698Sdim // preventing us from reordering them past other instructions with side 1213259698Sdim // effects. 1214259698Sdim bool FutureSideEffects = false; 1215259698Sdim AliasSetTracker AST(*AA); 1216259698Sdim // The map between instructions in f(%iv.(i+1)) and f(%iv). 1217259698Sdim DenseMap<Value *, Value *> BaseMap; 1218259698Sdim 1219288943Sdim // Compare iteration Iter to the base. 1220288943Sdim SmallInstructionSet Visited; 1221288943Sdim auto BaseIt = nextInstr(0, Uses, Visited); 1222288943Sdim auto RootIt = nextInstr(Iter, Uses, Visited); 1223288943Sdim auto LastRootIt = Uses.begin(); 1224259698Sdim 1225288943Sdim while (BaseIt != Uses.end() && RootIt != Uses.end()) { 1226288943Sdim Instruction *BaseInst = BaseIt->first; 1227288943Sdim Instruction *RootInst = RootIt->first; 1228259698Sdim 1229288943Sdim // Skip over the IV or root instructions; only match their users. 1230288943Sdim bool Continue = false; 1231288943Sdim if (isBaseInst(BaseInst)) { 1232288943Sdim Visited.insert(BaseInst); 1233288943Sdim BaseIt = nextInstr(0, Uses, Visited); 1234288943Sdim Continue = true; 1235288943Sdim } 1236288943Sdim if (isRootInst(RootInst)) { 1237288943Sdim LastRootIt = RootIt; 1238288943Sdim Visited.insert(RootInst); 1239288943Sdim RootIt = nextInstr(Iter, Uses, Visited); 1240288943Sdim Continue = true; 1241288943Sdim } 1242288943Sdim if (Continue) continue; 1243288943Sdim 1244288943Sdim if (!BaseInst->isSameOperationAs(RootInst)) { 1245288943Sdim // Last chance saloon. We don't try and solve the full isomorphism 1246288943Sdim // problem, but try and at least catch the case where two instructions 1247288943Sdim // *of different types* are round the wrong way. We won't be able to 1248288943Sdim // efficiently tell, given two ADD instructions, which way around we 1249288943Sdim // should match them, but given an ADD and a SUB, we can at least infer 1250288943Sdim // which one is which. 1251288943Sdim // 1252288943Sdim // This should allow us to deal with a greater subset of the isomorphism 1253288943Sdim // problem. It does however change a linear algorithm into a quadratic 1254288943Sdim // one, so limit the number of probes we do. 1255288943Sdim auto TryIt = RootIt; 1256288943Sdim unsigned N = NumToleratedFailedMatches; 1257288943Sdim while (TryIt != Uses.end() && 1258288943Sdim !BaseInst->isSameOperationAs(TryIt->first) && 1259288943Sdim N--) { 1260288943Sdim ++TryIt; 1261288943Sdim TryIt = nextInstr(Iter, Uses, Visited, &TryIt); 1262259698Sdim } 1263259698Sdim 1264288943Sdim if (TryIt == Uses.end() || TryIt == RootIt || 1265288943Sdim instrDependsOn(TryIt->first, RootIt, TryIt)) { 1266341825Sdim LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " 1267341825Sdim << *BaseInst << " vs. " << *RootInst << "\n"); 1268288943Sdim return false; 1269288943Sdim } 1270296417Sdim 1271288943Sdim RootIt = TryIt; 1272288943Sdim RootInst = TryIt->first; 1273259698Sdim } 1274259698Sdim 1275288943Sdim // All instructions between the last root and this root 1276296417Sdim // may belong to some other iteration. If they belong to a 1277288943Sdim // future iteration, then they're dangerous to alias with. 1278296417Sdim // 1279288943Sdim // Note that because we allow a limited amount of flexibility in the order 1280288943Sdim // that we visit nodes, LastRootIt might be *before* RootIt, in which 1281288943Sdim // case we've already checked this set of instructions so we shouldn't 1282288943Sdim // do anything. 1283288943Sdim for (; LastRootIt < RootIt; ++LastRootIt) { 1284288943Sdim Instruction *I = LastRootIt->first; 1285288943Sdim if (LastRootIt->second.find_first() < (int)Iter) 1286288943Sdim continue; 1287288943Sdim if (I->mayWriteToMemory()) 1288288943Sdim AST.add(I); 1289288943Sdim // Note: This is specifically guarded by a check on isa<PHINode>, 1290288943Sdim // which while a valid (somewhat arbitrary) micro-optimization, is 1291288943Sdim // needed because otherwise isSafeToSpeculativelyExecute returns 1292288943Sdim // false on PHI nodes. 1293314564Sdim if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) && 1294288943Sdim !isSafeToSpeculativelyExecute(I)) 1295288943Sdim // Intervening instructions cause side effects. 1296288943Sdim FutureSideEffects = true; 1297259698Sdim } 1298259698Sdim 1299259698Sdim // Make sure that this instruction, which is in the use set of this 1300259698Sdim // root instruction, does not also belong to the base set or the set of 1301288943Sdim // some other root instruction. 1302288943Sdim if (RootIt->second.count() > 1) { 1303341825Sdim LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst 1304341825Sdim << " vs. " << *RootInst << " (prev. case overlap)\n"); 1305288943Sdim return false; 1306259698Sdim } 1307259698Sdim 1308259698Sdim // Make sure that we don't alias with any instruction in the alias set 1309259698Sdim // tracker. If we do, then we depend on a future iteration, and we 1310259698Sdim // can't reroll. 1311288943Sdim if (RootInst->mayReadFromMemory()) 1312288943Sdim for (auto &K : AST) { 1313288943Sdim if (K.aliasesUnknownInst(RootInst, *AA)) { 1314341825Sdim LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " 1315341825Sdim << *BaseInst << " vs. " << *RootInst 1316341825Sdim << " (depends on future store)\n"); 1317288943Sdim return false; 1318259698Sdim } 1319259698Sdim } 1320259698Sdim 1321259698Sdim // If we've past an instruction from a future iteration that may have 1322259698Sdim // side effects, and this instruction might also, then we can't reorder 1323259698Sdim // them, and this matching fails. As an exception, we allow the alias 1324314564Sdim // set tracker to handle regular (unordered) load/store dependencies. 1325314564Sdim if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) && 1326288943Sdim !isSafeToSpeculativelyExecute(BaseInst)) || 1327314564Sdim (!isUnorderedLoadStore(RootInst) && 1328288943Sdim !isSafeToSpeculativelyExecute(RootInst)))) { 1329341825Sdim LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst 1330341825Sdim << " vs. " << *RootInst 1331341825Sdim << " (side effects prevent reordering)\n"); 1332288943Sdim return false; 1333259698Sdim } 1334259698Sdim 1335259698Sdim // For instructions that are part of a reduction, if the operation is 1336259698Sdim // associative, then don't bother matching the operands (because we 1337259698Sdim // already know that the instructions are isomorphic, and the order 1338259698Sdim // within the iteration does not matter). For non-associative reductions, 1339259698Sdim // we do need to match the operands, because we need to reject 1340259698Sdim // out-of-order instructions within an iteration! 1341259698Sdim // For example (assume floating-point addition), we need to reject this: 1342259698Sdim // x += a[i]; x += b[i]; 1343259698Sdim // x += a[i+1]; x += b[i+1]; 1344259698Sdim // x += b[i+2]; x += a[i+2]; 1345288943Sdim bool InReduction = Reductions.isPairInSame(BaseInst, RootInst); 1346259698Sdim 1347288943Sdim if (!(InReduction && BaseInst->isAssociative())) { 1348276479Sdim bool Swapped = false, SomeOpMatched = false; 1349288943Sdim for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) { 1350288943Sdim Value *Op2 = RootInst->getOperand(j); 1351259698Sdim 1352280031Sdim // If this is part of a reduction (and the operation is not 1353280031Sdim // associatve), then we match all operands, but not those that are 1354280031Sdim // part of the reduction. 1355259698Sdim if (InReduction) 1356259698Sdim if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) 1357288943Sdim if (Reductions.isPairInSame(RootInst, Op2I)) 1358259698Sdim continue; 1359259698Sdim 1360259698Sdim DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); 1361288943Sdim if (BMI != BaseMap.end()) { 1362259698Sdim Op2 = BMI->second; 1363288943Sdim } else { 1364288943Sdim for (auto &DRS : RootSets) { 1365288943Sdim if (DRS.Roots[Iter-1] == (Instruction*) Op2) { 1366288943Sdim Op2 = DRS.BaseInst; 1367288943Sdim break; 1368288943Sdim } 1369288943Sdim } 1370288943Sdim } 1371259698Sdim 1372288943Sdim if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) { 1373280031Sdim // If we've not already decided to swap the matched operands, and 1374280031Sdim // we've not already matched our first operand (note that we could 1375280031Sdim // have skipped matching the first operand because it is part of a 1376280031Sdim // reduction above), and the instruction is commutative, then try 1377280031Sdim // the swapped match. 1378288943Sdim if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched && 1379288943Sdim BaseInst->getOperand(!j) == Op2) { 1380259698Sdim Swapped = true; 1381259698Sdim } else { 1382341825Sdim LLVM_DEBUG(dbgs() 1383341825Sdim << "LRR: iteration root match failed at " << *BaseInst 1384341825Sdim << " vs. " << *RootInst << " (operand " << j << ")\n"); 1385288943Sdim return false; 1386259698Sdim } 1387259698Sdim } 1388259698Sdim 1389259698Sdim SomeOpMatched = true; 1390259698Sdim } 1391259698Sdim } 1392259698Sdim 1393288943Sdim if ((!PossibleRedLastSet.count(BaseInst) && 1394288943Sdim hasUsesOutsideLoop(BaseInst, L)) || 1395288943Sdim (!PossibleRedLastSet.count(RootInst) && 1396288943Sdim hasUsesOutsideLoop(RootInst, L))) { 1397341825Sdim LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst 1398341825Sdim << " vs. " << *RootInst << " (uses outside loop)\n"); 1399288943Sdim return false; 1400259698Sdim } 1401259698Sdim 1402288943Sdim Reductions.recordPair(BaseInst, RootInst, Iter); 1403288943Sdim BaseMap.insert(std::make_pair(RootInst, BaseInst)); 1404259698Sdim 1405288943Sdim LastRootIt = RootIt; 1406288943Sdim Visited.insert(BaseInst); 1407288943Sdim Visited.insert(RootInst); 1408288943Sdim BaseIt = nextInstr(0, Uses, Visited); 1409288943Sdim RootIt = nextInstr(Iter, Uses, Visited); 1410259698Sdim } 1411327952Sdim assert(BaseIt == Uses.end() && RootIt == Uses.end() && 1412327952Sdim "Mismatched set sizes!"); 1413259698Sdim } 1414259698Sdim 1415341825Sdim LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV 1416341825Sdim << "\n"); 1417259698Sdim 1418288943Sdim return true; 1419288943Sdim} 1420259698Sdim 1421341825Sdimvoid LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) { 1422288943Sdim BasicBlock *Header = L->getHeader(); 1423341825Sdim 1424341825Sdim // Compute the start and increment for each BaseInst before we start erasing 1425341825Sdim // instructions. 1426341825Sdim SmallVector<const SCEV *, 8> StartExprs; 1427341825Sdim SmallVector<const SCEV *, 8> IncrExprs; 1428341825Sdim for (auto &DRS : RootSets) { 1429341825Sdim const SCEVAddRecExpr *IVSCEV = 1430341825Sdim cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); 1431341825Sdim StartExprs.push_back(IVSCEV->getStart()); 1432341825Sdim IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV)); 1433341825Sdim } 1434341825Sdim 1435259698Sdim // Remove instructions associated with non-base iterations. 1436314564Sdim for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend(); 1437314564Sdim J != JE;) { 1438288943Sdim unsigned I = Uses[&*J].find_first(); 1439288943Sdim if (I > 0 && I < IL_All) { 1440341825Sdim LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n"); 1441314564Sdim J++->eraseFromParent(); 1442259698Sdim continue; 1443259698Sdim } 1444259698Sdim 1445280031Sdim ++J; 1446259698Sdim } 1447259698Sdim 1448341825Sdim // Rewrite each BaseInst using SCEV. 1449341825Sdim for (size_t i = 0, e = RootSets.size(); i != e; ++i) 1450341825Sdim // Insert the new induction variable. 1451341825Sdim replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]); 1452265925Sdim 1453341825Sdim { // Limit the lifetime of SCEVExpander. 1454341825Sdim BranchInst *BI = cast<BranchInst>(Header->getTerminator()); 1455341825Sdim const DataLayout &DL = Header->getModule()->getDataLayout(); 1456341825Sdim SCEVExpander Expander(*SE, DL, "reroll"); 1457341825Sdim auto Zero = SE->getZero(BackedgeTakenCount->getType()); 1458341825Sdim auto One = SE->getOne(BackedgeTakenCount->getType()); 1459341825Sdim auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap); 1460341825Sdim Value *NewIV = 1461341825Sdim Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(), 1462341825Sdim Header->getFirstNonPHIOrDbg()); 1463341825Sdim // FIXME: This arithmetic can overflow. 1464341825Sdim auto TripCount = SE->getAddExpr(BackedgeTakenCount, One); 1465341825Sdim auto ScaledTripCount = SE->getMulExpr( 1466341825Sdim TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale)); 1467341825Sdim auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One); 1468341825Sdim Value *TakenCount = 1469341825Sdim Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(), 1470341825Sdim Header->getFirstNonPHIOrDbg()); 1471341825Sdim Value *Cond = 1472341825Sdim new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond"); 1473341825Sdim BI->setCondition(Cond); 1474259698Sdim 1475341825Sdim if (BI->getSuccessor(1) != Header) 1476341825Sdim BI->swapSuccessors(); 1477341825Sdim } 1478341825Sdim 1479309124Sdim SimplifyInstructionsInBlock(Header, TLI); 1480309124Sdim DeleteDeadPHIs(Header, TLI); 1481309124Sdim} 1482265925Sdim 1483341825Sdimvoid LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS, 1484341825Sdim const SCEV *Start, 1485341825Sdim const SCEV *IncrExpr) { 1486309124Sdim BasicBlock *Header = L->getHeader(); 1487341825Sdim Instruction *Inst = DRS.BaseInst; 1488309124Sdim 1489309124Sdim const SCEV *NewIVSCEV = 1490309124Sdim SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap); 1491309124Sdim 1492309124Sdim { // Limit the lifetime of SCEVExpander. 1493309124Sdim const DataLayout &DL = Header->getModule()->getDataLayout(); 1494309124Sdim SCEVExpander Expander(*SE, DL, "reroll"); 1495314564Sdim Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(), 1496314564Sdim Header->getFirstNonPHIOrDbg()); 1497309124Sdim 1498309124Sdim for (auto &KV : Uses) 1499309124Sdim if (KV.second.find_first() == 0) 1500309124Sdim KV.first->replaceUsesOfWith(Inst, NewIV); 1501259698Sdim } 1502288943Sdim} 1503288943Sdim 1504288943Sdim// Validate the selected reductions. All iterations must have an isomorphic 1505288943Sdim// part of the reduction chain and, for non-associative reductions, the chain 1506288943Sdim// entries must appear in order. 1507288943Sdimbool LoopReroll::ReductionTracker::validateSelected() { 1508288943Sdim // For a non-associative reduction, the chain entries must appear in order. 1509309124Sdim for (int i : Reds) { 1510288943Sdim int PrevIter = 0, BaseCount = 0, Count = 0; 1511288943Sdim for (Instruction *J : PossibleReds[i]) { 1512288943Sdim // Note that all instructions in the chain must have been found because 1513288943Sdim // all instructions in the function must have been assigned to some 1514288943Sdim // iteration. 1515288943Sdim int Iter = PossibleRedIter[J]; 1516288943Sdim if (Iter != PrevIter && Iter != PrevIter + 1 && 1517288943Sdim !PossibleReds[i].getReducedValue()->isAssociative()) { 1518341825Sdim LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " 1519341825Sdim << J << "\n"); 1520288943Sdim return false; 1521288943Sdim } 1522288943Sdim 1523288943Sdim if (Iter != PrevIter) { 1524288943Sdim if (Count != BaseCount) { 1525341825Sdim LLVM_DEBUG(dbgs() 1526341825Sdim << "LRR: Iteration " << PrevIter << " reduction use count " 1527341825Sdim << Count << " is not equal to the base use count " 1528341825Sdim << BaseCount << "\n"); 1529288943Sdim return false; 1530288943Sdim } 1531288943Sdim 1532288943Sdim Count = 0; 1533288943Sdim } 1534288943Sdim 1535288943Sdim ++Count; 1536288943Sdim if (Iter == 0) 1537288943Sdim ++BaseCount; 1538288943Sdim 1539288943Sdim PrevIter = Iter; 1540288943Sdim } 1541288943Sdim } 1542288943Sdim 1543288943Sdim return true; 1544288943Sdim} 1545288943Sdim 1546288943Sdim// For all selected reductions, remove all parts except those in the first 1547288943Sdim// iteration (and the PHI). Replace outside uses of the reduced value with uses 1548288943Sdim// of the first-iteration reduced value (in other words, reroll the selected 1549288943Sdim// reductions). 1550288943Sdimvoid LoopReroll::ReductionTracker::replaceSelected() { 1551288943Sdim // Fixup reductions to refer to the last instruction associated with the 1552288943Sdim // first iteration (not the last). 1553309124Sdim for (int i : Reds) { 1554288943Sdim int j = 0; 1555288943Sdim for (int e = PossibleReds[i].size(); j != e; ++j) 1556288943Sdim if (PossibleRedIter[PossibleReds[i][j]] != 0) { 1557288943Sdim --j; 1558288943Sdim break; 1559288943Sdim } 1560288943Sdim 1561288943Sdim // Replace users with the new end-of-chain value. 1562288943Sdim SmallInstructionVector Users; 1563288943Sdim for (User *U : PossibleReds[i].getReducedValue()->users()) { 1564288943Sdim Users.push_back(cast<Instruction>(U)); 1565288943Sdim } 1566288943Sdim 1567309124Sdim for (Instruction *User : Users) 1568309124Sdim User->replaceUsesOfWith(PossibleReds[i].getReducedValue(), 1569288943Sdim PossibleReds[i][j]); 1570288943Sdim } 1571288943Sdim} 1572288943Sdim 1573288943Sdim// Reroll the provided loop with respect to the provided induction variable. 1574288943Sdim// Generally, we're looking for a loop like this: 1575288943Sdim// 1576288943Sdim// %iv = phi [ (preheader, ...), (body, %iv.next) ] 1577288943Sdim// f(%iv) 1578288943Sdim// %iv.1 = add %iv, 1 <-- a root increment 1579288943Sdim// f(%iv.1) 1580288943Sdim// %iv.2 = add %iv, 2 <-- a root increment 1581288943Sdim// f(%iv.2) 1582288943Sdim// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 1583288943Sdim// f(%iv.scale_m_1) 1584288943Sdim// ... 1585288943Sdim// %iv.next = add %iv, scale 1586288943Sdim// %cmp = icmp(%iv, ...) 1587288943Sdim// br %cmp, header, exit 1588288943Sdim// 1589288943Sdim// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of 1590288943Sdim// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can 1591288943Sdim// be intermixed with eachother. The restriction imposed by this algorithm is 1592288943Sdim// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), 1593288943Sdim// etc. be the same. 1594288943Sdim// 1595288943Sdim// First, we collect the use set of %iv, excluding the other increment roots. 1596288943Sdim// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) 1597288943Sdim// times, having collected the use set of f(%iv.(i+1)), during which we: 1598288943Sdim// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to 1599288943Sdim// the next unmatched instruction in f(%iv.(i+1)). 1600288943Sdim// - Ensure that both matched instructions don't have any external users 1601288943Sdim// (with the exception of last-in-chain reduction instructions). 1602288943Sdim// - Track the (aliasing) write set, and other side effects, of all 1603288943Sdim// instructions that belong to future iterations that come before the matched 1604288943Sdim// instructions. If the matched instructions read from that write set, then 1605288943Sdim// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in 1606288943Sdim// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, 1607288943Sdim// if any of these future instructions had side effects (could not be 1608288943Sdim// speculatively executed), and so do the matched instructions, when we 1609288943Sdim// cannot reorder those side-effect-producing instructions, and rerolling 1610288943Sdim// fails. 1611288943Sdim// 1612288943Sdim// Finally, we make sure that all loop instructions are either loop increment 1613288943Sdim// roots, belong to simple latch code, parts of validated reductions, part of 1614288943Sdim// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions 1615288943Sdim// have been validated), then we reroll the loop. 1616288943Sdimbool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, 1617341825Sdim const SCEV *BackedgeTakenCount, 1618288943Sdim ReductionTracker &Reductions) { 1619296417Sdim DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, 1620309124Sdim IVToIncMap, LoopControlIV); 1621288943Sdim 1622288943Sdim if (!DAGRoots.findRoots()) 1623288943Sdim return false; 1624341825Sdim LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV 1625341825Sdim << "\n"); 1626296417Sdim 1627288943Sdim if (!DAGRoots.validate(Reductions)) 1628288943Sdim return false; 1629288943Sdim if (!Reductions.validateSelected()) 1630288943Sdim return false; 1631288943Sdim // At this point, we've validated the rerolling, and we're committed to 1632288943Sdim // making changes! 1633288943Sdim 1634288943Sdim Reductions.replaceSelected(); 1635341825Sdim DAGRoots.replace(BackedgeTakenCount); 1636288943Sdim 1637259698Sdim ++NumRerolledLoops; 1638259698Sdim return true; 1639259698Sdim} 1640259698Sdim 1641259698Sdimbool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { 1642309124Sdim if (skipLoop(L)) 1643276479Sdim return false; 1644276479Sdim 1645296417Sdim AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1646288943Sdim LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1647296417Sdim SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1648360784Sdim TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( 1649360784Sdim *L->getHeader()->getParent()); 1650276479Sdim DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1651296417Sdim PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); 1652259698Sdim 1653259698Sdim BasicBlock *Header = L->getHeader(); 1654341825Sdim LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %" 1655341825Sdim << Header->getName() << " (" << L->getNumBlocks() 1656341825Sdim << " block(s))\n"); 1657259698Sdim 1658259698Sdim // For now, we'll handle only single BB loops. 1659259698Sdim if (L->getNumBlocks() > 1) 1660309124Sdim return false; 1661259698Sdim 1662259698Sdim if (!SE->hasLoopInvariantBackedgeTakenCount(L)) 1663309124Sdim return false; 1664259698Sdim 1665341825Sdim const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); 1666341825Sdim LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); 1667341825Sdim LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount 1668341825Sdim << "\n"); 1669259698Sdim 1670259698Sdim // First, we need to find the induction variable with respect to which we can 1671259698Sdim // reroll (there may be several possible options). 1672259698Sdim SmallInstructionVector PossibleIVs; 1673296417Sdim IVToIncMap.clear(); 1674309124Sdim LoopControlIV = nullptr; 1675259698Sdim collectPossibleIVs(L, PossibleIVs); 1676259698Sdim 1677259698Sdim if (PossibleIVs.empty()) { 1678341825Sdim LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n"); 1679309124Sdim return false; 1680259698Sdim } 1681259698Sdim 1682259698Sdim ReductionTracker Reductions; 1683259698Sdim collectPossibleReductions(L, Reductions); 1684309124Sdim bool Changed = false; 1685259698Sdim 1686259698Sdim // For each possible IV, collect the associated possible set of 'root' nodes 1687259698Sdim // (i+1, i+2, etc.). 1688309124Sdim for (Instruction *PossibleIV : PossibleIVs) 1689341825Sdim if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) { 1690259698Sdim Changed = true; 1691259698Sdim break; 1692259698Sdim } 1693341825Sdim LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); 1694259698Sdim 1695309124Sdim // Trip count of L has changed so SE must be re-evaluated. 1696309124Sdim if (Changed) 1697309124Sdim SE->forgetLoop(L); 1698309124Sdim 1699259698Sdim return Changed; 1700259698Sdim} 1701