1//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop reroller.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/APInt.h"
14#include "llvm/ADT/BitVector.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/DenseSet.h"
17#include "llvm/ADT/MapVector.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/SmallPtrSet.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/ADT/Statistic.h"
22#include "llvm/Analysis/AliasAnalysis.h"
23#include "llvm/Analysis/AliasSetTracker.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/LoopPass.h"
26#include "llvm/Analysis/ScalarEvolution.h"
27#include "llvm/Analysis/ScalarEvolutionExpander.h"
28#include "llvm/Analysis/ScalarEvolutionExpressions.h"
29#include "llvm/Analysis/TargetLibraryInfo.h"
30#include "llvm/Analysis/ValueTracking.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constants.h"
33#include "llvm/IR/DataLayout.h"
34#include "llvm/IR/DerivedTypes.h"
35#include "llvm/IR/Dominators.h"
36#include "llvm/IR/IRBuilder.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/IntrinsicInst.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Module.h"
43#include "llvm/IR/Type.h"
44#include "llvm/IR/Use.h"
45#include "llvm/IR/User.h"
46#include "llvm/IR/Value.h"
47#include "llvm/InitializePasses.h"
48#include "llvm/Pass.h"
49#include "llvm/Support/Casting.h"
50#include "llvm/Support/CommandLine.h"
51#include "llvm/Support/Debug.h"
52#include "llvm/Support/raw_ostream.h"
53#include "llvm/Transforms/Scalar.h"
54#include "llvm/Transforms/Utils.h"
55#include "llvm/Transforms/Utils/BasicBlockUtils.h"
56#include "llvm/Transforms/Utils/Local.h"
57#include "llvm/Transforms/Utils/LoopUtils.h"
58#include <cassert>
59#include <cstddef>
60#include <cstdint>
61#include <cstdlib>
62#include <iterator>
63#include <map>
64#include <utility>
65
66using namespace llvm;
67
68#define DEBUG_TYPE "loop-reroll"
69
70STATISTIC(NumRerolledLoops, "Number of rerolled loops");
71
72static cl::opt<unsigned>
73NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
74                          cl::Hidden,
75                          cl::desc("The maximum number of failures to tolerate"
76                                   " during fuzzy matching. (default: 400)"));
77
78// This loop re-rolling transformation aims to transform loops like this:
79//
80// int foo(int a);
81// void bar(int *x) {
82//   for (int i = 0; i < 500; i += 3) {
83//     foo(i);
84//     foo(i+1);
85//     foo(i+2);
86//   }
87// }
88//
89// into a loop like this:
90//
91// void bar(int *x) {
92//   for (int i = 0; i < 500; ++i)
93//     foo(i);
94// }
95//
96// It does this by looking for loops that, besides the latch code, are composed
97// of isomorphic DAGs of instructions, with each DAG rooted at some increment
98// to the induction variable, and where each DAG is isomorphic to the DAG
99// rooted at the induction variable (excepting the sub-DAGs which root the
100// other induction-variable increments). In other words, we're looking for loop
101// bodies of the form:
102//
103// %iv = phi [ (preheader, ...), (body, %iv.next) ]
104// f(%iv)
105// %iv.1 = add %iv, 1                <-- a root increment
106// f(%iv.1)
107// %iv.2 = add %iv, 2                <-- a root increment
108// f(%iv.2)
109// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
110// f(%iv.scale_m_1)
111// ...
112// %iv.next = add %iv, scale
113// %cmp = icmp(%iv, ...)
114// br %cmp, header, exit
115//
116// where each f(i) is a set of instructions that, collectively, are a function
117// only of i (and other loop-invariant values).
118//
119// As a special case, we can also reroll loops like this:
120//
121// int foo(int);
122// void bar(int *x) {
123//   for (int i = 0; i < 500; ++i) {
124//     x[3*i] = foo(0);
125//     x[3*i+1] = foo(0);
126//     x[3*i+2] = foo(0);
127//   }
128// }
129//
130// into this:
131//
132// void bar(int *x) {
133//   for (int i = 0; i < 1500; ++i)
134//     x[i] = foo(0);
135// }
136//
137// in which case, we're looking for inputs like this:
138//
139// %iv = phi [ (preheader, ...), (body, %iv.next) ]
140// %scaled.iv = mul %iv, scale
141// f(%scaled.iv)
142// %scaled.iv.1 = add %scaled.iv, 1
143// f(%scaled.iv.1)
144// %scaled.iv.2 = add %scaled.iv, 2
145// f(%scaled.iv.2)
146// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
147// f(%scaled.iv.scale_m_1)
148// ...
149// %iv.next = add %iv, 1
150// %cmp = icmp(%iv, ...)
151// br %cmp, header, exit
152
153namespace {
154
155  enum IterationLimits {
156    /// The maximum number of iterations that we'll try and reroll.
157    IL_MaxRerollIterations = 32,
158    /// The bitvector index used by loop induction variables and other
159    /// instructions that belong to all iterations.
160    IL_All,
161    IL_End
162  };
163
164  class LoopReroll : public LoopPass {
165  public:
166    static char ID; // Pass ID, replacement for typeid
167
168    LoopReroll() : LoopPass(ID) {
169      initializeLoopRerollPass(*PassRegistry::getPassRegistry());
170    }
171
172    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
173
174    void getAnalysisUsage(AnalysisUsage &AU) const override {
175      AU.addRequired<TargetLibraryInfoWrapperPass>();
176      getLoopAnalysisUsage(AU);
177    }
178
179  protected:
180    AliasAnalysis *AA;
181    LoopInfo *LI;
182    ScalarEvolution *SE;
183    TargetLibraryInfo *TLI;
184    DominatorTree *DT;
185    bool PreserveLCSSA;
186
187    using SmallInstructionVector = SmallVector<Instruction *, 16>;
188    using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
189
190    // Map between induction variable and its increment
191    DenseMap<Instruction *, int64_t> IVToIncMap;
192
193    // For loop with multiple induction variable, remember the one used only to
194    // control the loop.
195    Instruction *LoopControlIV;
196
197    // A chain of isomorphic instructions, identified by a single-use PHI
198    // representing a reduction. Only the last value may be used outside the
199    // loop.
200    struct SimpleLoopReduction {
201      SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
202        assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
203        add(L);
204      }
205
206      bool valid() const {
207        return Valid;
208      }
209
210      Instruction *getPHI() const {
211        assert(Valid && "Using invalid reduction");
212        return Instructions.front();
213      }
214
215      Instruction *getReducedValue() const {
216        assert(Valid && "Using invalid reduction");
217        return Instructions.back();
218      }
219
220      Instruction *get(size_t i) const {
221        assert(Valid && "Using invalid reduction");
222        return Instructions[i+1];
223      }
224
225      Instruction *operator [] (size_t i) const { return get(i); }
226
227      // The size, ignoring the initial PHI.
228      size_t size() const {
229        assert(Valid && "Using invalid reduction");
230        return Instructions.size()-1;
231      }
232
233      using iterator = SmallInstructionVector::iterator;
234      using const_iterator = SmallInstructionVector::const_iterator;
235
236      iterator begin() {
237        assert(Valid && "Using invalid reduction");
238        return std::next(Instructions.begin());
239      }
240
241      const_iterator begin() const {
242        assert(Valid && "Using invalid reduction");
243        return std::next(Instructions.begin());
244      }
245
246      iterator end() { return Instructions.end(); }
247      const_iterator end() const { return Instructions.end(); }
248
249    protected:
250      bool Valid = false;
251      SmallInstructionVector Instructions;
252
253      void add(Loop *L);
254    };
255
256    // The set of all reductions, and state tracking of possible reductions
257    // during loop instruction processing.
258    struct ReductionTracker {
259      using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
260
261      // Add a new possible reduction.
262      void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
263
264      // Setup to track possible reductions corresponding to the provided
265      // rerolling scale. Only reductions with a number of non-PHI instructions
266      // that is divisible by the scale are considered. Three instructions sets
267      // are filled in:
268      //   - A set of all possible instructions in eligible reductions.
269      //   - A set of all PHIs in eligible reductions
270      //   - A set of all reduced values (last instructions) in eligible
271      //     reductions.
272      void restrictToScale(uint64_t Scale,
273                           SmallInstructionSet &PossibleRedSet,
274                           SmallInstructionSet &PossibleRedPHISet,
275                           SmallInstructionSet &PossibleRedLastSet) {
276        PossibleRedIdx.clear();
277        PossibleRedIter.clear();
278        Reds.clear();
279
280        for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
281          if (PossibleReds[i].size() % Scale == 0) {
282            PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
283            PossibleRedPHISet.insert(PossibleReds[i].getPHI());
284
285            PossibleRedSet.insert(PossibleReds[i].getPHI());
286            PossibleRedIdx[PossibleReds[i].getPHI()] = i;
287            for (Instruction *J : PossibleReds[i]) {
288              PossibleRedSet.insert(J);
289              PossibleRedIdx[J] = i;
290            }
291          }
292      }
293
294      // The functions below are used while processing the loop instructions.
295
296      // Are the two instructions both from reductions, and furthermore, from
297      // the same reduction?
298      bool isPairInSame(Instruction *J1, Instruction *J2) {
299        DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
300        if (J1I != PossibleRedIdx.end()) {
301          DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
302          if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
303            return true;
304        }
305
306        return false;
307      }
308
309      // The two provided instructions, the first from the base iteration, and
310      // the second from iteration i, form a matched pair. If these are part of
311      // a reduction, record that fact.
312      void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
313        if (PossibleRedIdx.count(J1)) {
314          assert(PossibleRedIdx.count(J2) &&
315                 "Recording reduction vs. non-reduction instruction?");
316
317          PossibleRedIter[J1] = 0;
318          PossibleRedIter[J2] = i;
319
320          int Idx = PossibleRedIdx[J1];
321          assert(Idx == PossibleRedIdx[J2] &&
322                 "Recording pair from different reductions?");
323          Reds.insert(Idx);
324        }
325      }
326
327      // The functions below can be called after we've finished processing all
328      // instructions in the loop, and we know which reductions were selected.
329
330      bool validateSelected();
331      void replaceSelected();
332
333    protected:
334      // The vector of all possible reductions (for any scale).
335      SmallReductionVector PossibleReds;
336
337      DenseMap<Instruction *, int> PossibleRedIdx;
338      DenseMap<Instruction *, int> PossibleRedIter;
339      DenseSet<int> Reds;
340    };
341
342    // A DAGRootSet models an induction variable being used in a rerollable
343    // loop. For example,
344    //
345    //   x[i*3+0] = y1
346    //   x[i*3+1] = y2
347    //   x[i*3+2] = y3
348    //
349    //   Base instruction -> i*3
350    //                    +---+----+
351    //                   /    |     \
352    //               ST[y1]  +1     +2  <-- Roots
353    //                        |      |
354    //                      ST[y2] ST[y3]
355    //
356    // There may be multiple DAGRoots, for example:
357    //
358    //   x[i*2+0] = ...   (1)
359    //   x[i*2+1] = ...   (1)
360    //   x[i*2+4] = ...   (2)
361    //   x[i*2+5] = ...   (2)
362    //   x[(i+1234)*2+5678] = ... (3)
363    //   x[(i+1234)*2+5679] = ... (3)
364    //
365    // The loop will be rerolled by adding a new loop induction variable,
366    // one for the Base instruction in each DAGRootSet.
367    //
368    struct DAGRootSet {
369      Instruction *BaseInst;
370      SmallInstructionVector Roots;
371
372      // The instructions between IV and BaseInst (but not including BaseInst).
373      SmallInstructionSet SubsumedInsts;
374    };
375
376    // The set of all DAG roots, and state tracking of all roots
377    // for a particular induction variable.
378    struct DAGRootTracker {
379      DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
380                     ScalarEvolution *SE, AliasAnalysis *AA,
381                     TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
382                     bool PreserveLCSSA,
383                     DenseMap<Instruction *, int64_t> &IncrMap,
384                     Instruction *LoopCtrlIV)
385          : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
386            PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
387            LoopControlIV(LoopCtrlIV) {}
388
389      /// Stage 1: Find all the DAG roots for the induction variable.
390      bool findRoots();
391
392      /// Stage 2: Validate if the found roots are valid.
393      bool validate(ReductionTracker &Reductions);
394
395      /// Stage 3: Assuming validate() returned true, perform the
396      /// replacement.
397      /// @param BackedgeTakenCount The backedge-taken count of L.
398      void replace(const SCEV *BackedgeTakenCount);
399
400    protected:
401      using UsesTy = MapVector<Instruction *, BitVector>;
402
403      void findRootsRecursive(Instruction *IVU,
404                              SmallInstructionSet SubsumedInsts);
405      bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
406      bool collectPossibleRoots(Instruction *Base,
407                                std::map<int64_t,Instruction*> &Roots);
408      bool validateRootSet(DAGRootSet &DRS);
409
410      bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
411      void collectInLoopUserSet(const SmallInstructionVector &Roots,
412                                const SmallInstructionSet &Exclude,
413                                const SmallInstructionSet &Final,
414                                DenseSet<Instruction *> &Users);
415      void collectInLoopUserSet(Instruction *Root,
416                                const SmallInstructionSet &Exclude,
417                                const SmallInstructionSet &Final,
418                                DenseSet<Instruction *> &Users);
419
420      UsesTy::iterator nextInstr(int Val, UsesTy &In,
421                                 const SmallInstructionSet &Exclude,
422                                 UsesTy::iterator *StartI=nullptr);
423      bool isBaseInst(Instruction *I);
424      bool isRootInst(Instruction *I);
425      bool instrDependsOn(Instruction *I,
426                          UsesTy::iterator Start,
427                          UsesTy::iterator End);
428      void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
429
430      LoopReroll *Parent;
431
432      // Members of Parent, replicated here for brevity.
433      Loop *L;
434      ScalarEvolution *SE;
435      AliasAnalysis *AA;
436      TargetLibraryInfo *TLI;
437      DominatorTree *DT;
438      LoopInfo *LI;
439      bool PreserveLCSSA;
440
441      // The loop induction variable.
442      Instruction *IV;
443
444      // Loop step amount.
445      int64_t Inc;
446
447      // Loop reroll count; if Inc == 1, this records the scaling applied
448      // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
449      // If Inc is not 1, Scale = Inc.
450      uint64_t Scale;
451
452      // The roots themselves.
453      SmallVector<DAGRootSet,16> RootSets;
454
455      // All increment instructions for IV.
456      SmallInstructionVector LoopIncs;
457
458      // Map of all instructions in the loop (in order) to the iterations
459      // they are used in (or specially, IL_All for instructions
460      // used in the loop increment mechanism).
461      UsesTy Uses;
462
463      // Map between induction variable and its increment
464      DenseMap<Instruction *, int64_t> &IVToIncMap;
465
466      Instruction *LoopControlIV;
467    };
468
469    // Check if it is a compare-like instruction whose user is a branch
470    bool isCompareUsedByBranch(Instruction *I) {
471      auto *TI = I->getParent()->getTerminator();
472      if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
473        return false;
474      return I->hasOneUse() && TI->getOperand(0) == I;
475    };
476
477    bool isLoopControlIV(Loop *L, Instruction *IV);
478    void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
479    void collectPossibleReductions(Loop *L,
480           ReductionTracker &Reductions);
481    bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
482                const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
483  };
484
485} // end anonymous namespace
486
487char LoopReroll::ID = 0;
488
489INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
490INITIALIZE_PASS_DEPENDENCY(LoopPass)
491INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
492INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
493
494Pass *llvm::createLoopRerollPass() {
495  return new LoopReroll;
496}
497
498// Returns true if the provided instruction is used outside the given loop.
499// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
500// non-loop blocks to be outside the loop.
501static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
502  for (User *U : I->users()) {
503    if (!L->contains(cast<Instruction>(U)))
504      return true;
505  }
506  return false;
507}
508
509// Check if an IV is only used to control the loop. There are two cases:
510// 1. It only has one use which is loop increment, and the increment is only
511// used by comparison and the PHI (could has sext with nsw in between), and the
512// comparison is only used by branch.
513// 2. It is used by loop increment and the comparison, the loop increment is
514// only used by the PHI, and the comparison is used only by the branch.
515bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
516  unsigned IVUses = IV->getNumUses();
517  if (IVUses != 2 && IVUses != 1)
518    return false;
519
520  for (auto *User : IV->users()) {
521    int32_t IncOrCmpUses = User->getNumUses();
522    bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
523
524    // User can only have one or two uses.
525    if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
526      return false;
527
528    // Case 1
529    if (IVUses == 1) {
530      // The only user must be the loop increment.
531      // The loop increment must have two uses.
532      if (IsCompInst || IncOrCmpUses != 2)
533        return false;
534    }
535
536    // Case 2
537    if (IVUses == 2 && IncOrCmpUses != 1)
538      return false;
539
540    // The users of the IV must be a binary operation or a comparison
541    if (auto *BO = dyn_cast<BinaryOperator>(User)) {
542      if (BO->getOpcode() == Instruction::Add) {
543        // Loop Increment
544        // User of Loop Increment should be either PHI or CMP
545        for (auto *UU : User->users()) {
546          if (PHINode *PN = dyn_cast<PHINode>(UU)) {
547            if (PN != IV)
548              return false;
549          }
550          // Must be a CMP or an ext (of a value with nsw) then CMP
551          else {
552            Instruction *UUser = dyn_cast<Instruction>(UU);
553            // Skip SExt if we are extending an nsw value
554            // TODO: Allow ZExt too
555            if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
556                isa<SExtInst>(UUser))
557              UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
558            if (!isCompareUsedByBranch(UUser))
559              return false;
560          }
561        }
562      } else
563        return false;
564      // Compare : can only have one use, and must be branch
565    } else if (!IsCompInst)
566      return false;
567  }
568  return true;
569}
570
571// Collect the list of loop induction variables with respect to which it might
572// be possible to reroll the loop.
573void LoopReroll::collectPossibleIVs(Loop *L,
574                                    SmallInstructionVector &PossibleIVs) {
575  BasicBlock *Header = L->getHeader();
576  for (BasicBlock::iterator I = Header->begin(),
577       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
578    if (!isa<PHINode>(I))
579      continue;
580    if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
581      continue;
582
583    if (const SCEVAddRecExpr *PHISCEV =
584            dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
585      if (PHISCEV->getLoop() != L)
586        continue;
587      if (!PHISCEV->isAffine())
588        continue;
589      auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
590      if (IncSCEV) {
591        IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
592        LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
593                          << "\n");
594
595        if (isLoopControlIV(L, &*I)) {
596          assert(!LoopControlIV && "Found two loop control only IV");
597          LoopControlIV = &(*I);
598          LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
599                            << " = " << *PHISCEV << "\n");
600        } else
601          PossibleIVs.push_back(&*I);
602      }
603    }
604  }
605}
606
607// Add the remainder of the reduction-variable chain to the instruction vector
608// (the initial PHINode has already been added). If successful, the object is
609// marked as valid.
610void LoopReroll::SimpleLoopReduction::add(Loop *L) {
611  assert(!Valid && "Cannot add to an already-valid chain");
612
613  // The reduction variable must be a chain of single-use instructions
614  // (including the PHI), except for the last value (which is used by the PHI
615  // and also outside the loop).
616  Instruction *C = Instructions.front();
617  if (C->user_empty())
618    return;
619
620  do {
621    C = cast<Instruction>(*C->user_begin());
622    if (C->hasOneUse()) {
623      if (!C->isBinaryOp())
624        return;
625
626      if (!(isa<PHINode>(Instructions.back()) ||
627            C->isSameOperationAs(Instructions.back())))
628        return;
629
630      Instructions.push_back(C);
631    }
632  } while (C->hasOneUse());
633
634  if (Instructions.size() < 2 ||
635      !C->isSameOperationAs(Instructions.back()) ||
636      C->use_empty())
637    return;
638
639  // C is now the (potential) last instruction in the reduction chain.
640  for (User *U : C->users()) {
641    // The only in-loop user can be the initial PHI.
642    if (L->contains(cast<Instruction>(U)))
643      if (cast<Instruction>(U) != Instructions.front())
644        return;
645  }
646
647  Instructions.push_back(C);
648  Valid = true;
649}
650
651// Collect the vector of possible reduction variables.
652void LoopReroll::collectPossibleReductions(Loop *L,
653  ReductionTracker &Reductions) {
654  BasicBlock *Header = L->getHeader();
655  for (BasicBlock::iterator I = Header->begin(),
656       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
657    if (!isa<PHINode>(I))
658      continue;
659    if (!I->getType()->isSingleValueType())
660      continue;
661
662    SimpleLoopReduction SLR(&*I, L);
663    if (!SLR.valid())
664      continue;
665
666    LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
667                      << SLR.size() << " chained instructions)\n");
668    Reductions.addSLR(SLR);
669  }
670}
671
672// Collect the set of all users of the provided root instruction. This set of
673// users contains not only the direct users of the root instruction, but also
674// all users of those users, and so on. There are two exceptions:
675//
676//   1. Instructions in the set of excluded instructions are never added to the
677//   use set (even if they are users). This is used, for example, to exclude
678//   including root increments in the use set of the primary IV.
679//
680//   2. Instructions in the set of final instructions are added to the use set
681//   if they are users, but their users are not added. This is used, for
682//   example, to prevent a reduction update from forcing all later reduction
683//   updates into the use set.
684void LoopReroll::DAGRootTracker::collectInLoopUserSet(
685  Instruction *Root, const SmallInstructionSet &Exclude,
686  const SmallInstructionSet &Final,
687  DenseSet<Instruction *> &Users) {
688  SmallInstructionVector Queue(1, Root);
689  while (!Queue.empty()) {
690    Instruction *I = Queue.pop_back_val();
691    if (!Users.insert(I).second)
692      continue;
693
694    if (!Final.count(I))
695      for (Use &U : I->uses()) {
696        Instruction *User = cast<Instruction>(U.getUser());
697        if (PHINode *PN = dyn_cast<PHINode>(User)) {
698          // Ignore "wrap-around" uses to PHIs of this loop's header.
699          if (PN->getIncomingBlock(U) == L->getHeader())
700            continue;
701        }
702
703        if (L->contains(User) && !Exclude.count(User)) {
704          Queue.push_back(User);
705        }
706      }
707
708    // We also want to collect single-user "feeder" values.
709    for (User::op_iterator OI = I->op_begin(),
710         OIE = I->op_end(); OI != OIE; ++OI) {
711      if (Instruction *Op = dyn_cast<Instruction>(*OI))
712        if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
713            !Final.count(Op))
714          Queue.push_back(Op);
715    }
716  }
717}
718
719// Collect all of the users of all of the provided root instructions (combined
720// into a single set).
721void LoopReroll::DAGRootTracker::collectInLoopUserSet(
722  const SmallInstructionVector &Roots,
723  const SmallInstructionSet &Exclude,
724  const SmallInstructionSet &Final,
725  DenseSet<Instruction *> &Users) {
726  for (Instruction *Root : Roots)
727    collectInLoopUserSet(Root, Exclude, Final, Users);
728}
729
730static bool isUnorderedLoadStore(Instruction *I) {
731  if (LoadInst *LI = dyn_cast<LoadInst>(I))
732    return LI->isUnordered();
733  if (StoreInst *SI = dyn_cast<StoreInst>(I))
734    return SI->isUnordered();
735  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
736    return !MI->isVolatile();
737  return false;
738}
739
740/// Return true if IVU is a "simple" arithmetic operation.
741/// This is used for narrowing the search space for DAGRoots; only arithmetic
742/// and GEPs can be part of a DAGRoot.
743static bool isSimpleArithmeticOp(User *IVU) {
744  if (Instruction *I = dyn_cast<Instruction>(IVU)) {
745    switch (I->getOpcode()) {
746    default: return false;
747    case Instruction::Add:
748    case Instruction::Sub:
749    case Instruction::Mul:
750    case Instruction::Shl:
751    case Instruction::AShr:
752    case Instruction::LShr:
753    case Instruction::GetElementPtr:
754    case Instruction::Trunc:
755    case Instruction::ZExt:
756    case Instruction::SExt:
757      return true;
758    }
759  }
760  return false;
761}
762
763static bool isLoopIncrement(User *U, Instruction *IV) {
764  BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
765
766  if ((BO && BO->getOpcode() != Instruction::Add) ||
767      (!BO && !isa<GetElementPtrInst>(U)))
768    return false;
769
770  for (auto *UU : U->users()) {
771    PHINode *PN = dyn_cast<PHINode>(UU);
772    if (PN && PN == IV)
773      return true;
774  }
775  return false;
776}
777
778bool LoopReroll::DAGRootTracker::
779collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
780  SmallInstructionVector BaseUsers;
781
782  for (auto *I : Base->users()) {
783    ConstantInt *CI = nullptr;
784
785    if (isLoopIncrement(I, IV)) {
786      LoopIncs.push_back(cast<Instruction>(I));
787      continue;
788    }
789
790    // The root nodes must be either GEPs, ORs or ADDs.
791    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
792      if (BO->getOpcode() == Instruction::Add ||
793          BO->getOpcode() == Instruction::Or)
794        CI = dyn_cast<ConstantInt>(BO->getOperand(1));
795    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
796      Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
797      CI = dyn_cast<ConstantInt>(LastOperand);
798    }
799
800    if (!CI) {
801      if (Instruction *II = dyn_cast<Instruction>(I)) {
802        BaseUsers.push_back(II);
803        continue;
804      } else {
805        LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
806                          << "\n");
807        return false;
808      }
809    }
810
811    int64_t V = std::abs(CI->getValue().getSExtValue());
812    if (Roots.find(V) != Roots.end())
813      // No duplicates, please.
814      return false;
815
816    Roots[V] = cast<Instruction>(I);
817  }
818
819  // Make sure we have at least two roots.
820  if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
821    return false;
822
823  // If we found non-loop-inc, non-root users of Base, assume they are
824  // for the zeroth root index. This is because "add %a, 0" gets optimized
825  // away.
826  if (BaseUsers.size()) {
827    if (Roots.find(0) != Roots.end()) {
828      LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
829      return false;
830    }
831    Roots[0] = Base;
832  }
833
834  // Calculate the number of users of the base, or lowest indexed, iteration.
835  unsigned NumBaseUses = BaseUsers.size();
836  if (NumBaseUses == 0)
837    NumBaseUses = Roots.begin()->second->getNumUses();
838
839  // Check that every node has the same number of users.
840  for (auto &KV : Roots) {
841    if (KV.first == 0)
842      continue;
843    if (!KV.second->hasNUses(NumBaseUses)) {
844      LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
845                        << "#Base=" << NumBaseUses
846                        << ", #Root=" << KV.second->getNumUses() << "\n");
847      return false;
848    }
849  }
850
851  return true;
852}
853
854void LoopReroll::DAGRootTracker::
855findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
856  // Does the user look like it could be part of a root set?
857  // All its users must be simple arithmetic ops.
858  if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
859    return;
860
861  if (I != IV && findRootsBase(I, SubsumedInsts))
862    return;
863
864  SubsumedInsts.insert(I);
865
866  for (User *V : I->users()) {
867    Instruction *I = cast<Instruction>(V);
868    if (is_contained(LoopIncs, I))
869      continue;
870
871    if (!isSimpleArithmeticOp(I))
872      continue;
873
874    // The recursive call makes a copy of SubsumedInsts.
875    findRootsRecursive(I, SubsumedInsts);
876  }
877}
878
879bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
880  if (DRS.Roots.empty())
881    return false;
882
883  // Consider a DAGRootSet with N-1 roots (so N different values including
884  //   BaseInst).
885  // Define d = Roots[0] - BaseInst, which should be the same as
886  //   Roots[I] - Roots[I-1] for all I in [1..N).
887  // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
888  //   loop iteration J.
889  //
890  // Now, For the loop iterations to be consecutive:
891  //   D = d * N
892  const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
893  if (!ADR)
894    return false;
895
896  // Check that the first root is evenly spaced.
897  unsigned N = DRS.Roots.size() + 1;
898  const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
899  const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
900  if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
901    return false;
902
903  // Check that the remainling roots are evenly spaced.
904  for (unsigned i = 1; i < N - 1; ++i) {
905    const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
906                                               SE->getSCEV(DRS.Roots[i-1]));
907    if (NewStepSCEV != StepSCEV)
908      return false;
909  }
910
911  return true;
912}
913
914bool LoopReroll::DAGRootTracker::
915findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
916  // The base of a RootSet must be an AddRec, so it can be erased.
917  const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
918  if (!IVU_ADR || IVU_ADR->getLoop() != L)
919    return false;
920
921  std::map<int64_t, Instruction*> V;
922  if (!collectPossibleRoots(IVU, V))
923    return false;
924
925  // If we didn't get a root for index zero, then IVU must be
926  // subsumed.
927  if (V.find(0) == V.end())
928    SubsumedInsts.insert(IVU);
929
930  // Partition the vector into monotonically increasing indexes.
931  DAGRootSet DRS;
932  DRS.BaseInst = nullptr;
933
934  SmallVector<DAGRootSet, 16> PotentialRootSets;
935
936  for (auto &KV : V) {
937    if (!DRS.BaseInst) {
938      DRS.BaseInst = KV.second;
939      DRS.SubsumedInsts = SubsumedInsts;
940    } else if (DRS.Roots.empty()) {
941      DRS.Roots.push_back(KV.second);
942    } else if (V.find(KV.first - 1) != V.end()) {
943      DRS.Roots.push_back(KV.second);
944    } else {
945      // Linear sequence terminated.
946      if (!validateRootSet(DRS))
947        return false;
948
949      // Construct a new DAGRootSet with the next sequence.
950      PotentialRootSets.push_back(DRS);
951      DRS.BaseInst = KV.second;
952      DRS.Roots.clear();
953    }
954  }
955
956  if (!validateRootSet(DRS))
957    return false;
958
959  PotentialRootSets.push_back(DRS);
960
961  RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
962
963  return true;
964}
965
966bool LoopReroll::DAGRootTracker::findRoots() {
967  Inc = IVToIncMap[IV];
968
969  assert(RootSets.empty() && "Unclean state!");
970  if (std::abs(Inc) == 1) {
971    for (auto *IVU : IV->users()) {
972      if (isLoopIncrement(IVU, IV))
973        LoopIncs.push_back(cast<Instruction>(IVU));
974    }
975    findRootsRecursive(IV, SmallInstructionSet());
976    LoopIncs.push_back(IV);
977  } else {
978    if (!findRootsBase(IV, SmallInstructionSet()))
979      return false;
980  }
981
982  // Ensure all sets have the same size.
983  if (RootSets.empty()) {
984    LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
985    return false;
986  }
987  for (auto &V : RootSets) {
988    if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
989      LLVM_DEBUG(
990          dbgs()
991          << "LRR: Aborting because not all root sets have the same size\n");
992      return false;
993    }
994  }
995
996  Scale = RootSets[0].Roots.size() + 1;
997
998  if (Scale > IL_MaxRerollIterations) {
999    LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
1000                      << "#Found=" << Scale
1001                      << ", #Max=" << IL_MaxRerollIterations << "\n");
1002    return false;
1003  }
1004
1005  LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
1006                    << "\n");
1007
1008  return true;
1009}
1010
1011bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
1012  // Populate the MapVector with all instructions in the block, in order first,
1013  // so we can iterate over the contents later in perfect order.
1014  for (auto &I : *L->getHeader()) {
1015    Uses[&I].resize(IL_End);
1016  }
1017
1018  SmallInstructionSet Exclude;
1019  for (auto &DRS : RootSets) {
1020    Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
1021    Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
1022    Exclude.insert(DRS.BaseInst);
1023  }
1024  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
1025
1026  for (auto &DRS : RootSets) {
1027    DenseSet<Instruction*> VBase;
1028    collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
1029    for (auto *I : VBase) {
1030      Uses[I].set(0);
1031    }
1032
1033    unsigned Idx = 1;
1034    for (auto *Root : DRS.Roots) {
1035      DenseSet<Instruction*> V;
1036      collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
1037
1038      // While we're here, check the use sets are the same size.
1039      if (V.size() != VBase.size()) {
1040        LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
1041        return false;
1042      }
1043
1044      for (auto *I : V) {
1045        Uses[I].set(Idx);
1046      }
1047      ++Idx;
1048    }
1049
1050    // Make sure our subsumed instructions are remembered too.
1051    for (auto *I : DRS.SubsumedInsts) {
1052      Uses[I].set(IL_All);
1053    }
1054  }
1055
1056  // Make sure the loop increments are also accounted for.
1057
1058  Exclude.clear();
1059  for (auto &DRS : RootSets) {
1060    Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
1061    Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
1062    Exclude.insert(DRS.BaseInst);
1063  }
1064
1065  DenseSet<Instruction*> V;
1066  collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
1067  for (auto *I : V) {
1068    Uses[I].set(IL_All);
1069  }
1070
1071  return true;
1072}
1073
1074/// Get the next instruction in "In" that is a member of set Val.
1075/// Start searching from StartI, and do not return anything in Exclude.
1076/// If StartI is not given, start from In.begin().
1077LoopReroll::DAGRootTracker::UsesTy::iterator
1078LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
1079                                      const SmallInstructionSet &Exclude,
1080                                      UsesTy::iterator *StartI) {
1081  UsesTy::iterator I = StartI ? *StartI : In.begin();
1082  while (I != In.end() && (I->second.test(Val) == 0 ||
1083                           Exclude.count(I->first) != 0))
1084    ++I;
1085  return I;
1086}
1087
1088bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
1089  for (auto &DRS : RootSets) {
1090    if (DRS.BaseInst == I)
1091      return true;
1092  }
1093  return false;
1094}
1095
1096bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
1097  for (auto &DRS : RootSets) {
1098    if (is_contained(DRS.Roots, I))
1099      return true;
1100  }
1101  return false;
1102}
1103
1104/// Return true if instruction I depends on any instruction between
1105/// Start and End.
1106bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
1107                                                UsesTy::iterator Start,
1108                                                UsesTy::iterator End) {
1109  for (auto *U : I->users()) {
1110    for (auto It = Start; It != End; ++It)
1111      if (U == It->first)
1112        return true;
1113  }
1114  return false;
1115}
1116
1117static bool isIgnorableInst(const Instruction *I) {
1118  if (isa<DbgInfoIntrinsic>(I))
1119    return true;
1120  const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
1121  if (!II)
1122    return false;
1123  switch (II->getIntrinsicID()) {
1124    default:
1125      return false;
1126    case Intrinsic::annotation:
1127    case Intrinsic::ptr_annotation:
1128    case Intrinsic::var_annotation:
1129    // TODO: the following intrinsics may also be whitelisted:
1130    //   lifetime_start, lifetime_end, invariant_start, invariant_end
1131      return true;
1132  }
1133  return false;
1134}
1135
1136bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
1137  // We now need to check for equivalence of the use graph of each root with
1138  // that of the primary induction variable (excluding the roots). Our goal
1139  // here is not to solve the full graph isomorphism problem, but rather to
1140  // catch common cases without a lot of work. As a result, we will assume
1141  // that the relative order of the instructions in each unrolled iteration
1142  // is the same (although we will not make an assumption about how the
1143  // different iterations are intermixed). Note that while the order must be
1144  // the same, the instructions may not be in the same basic block.
1145
1146  // An array of just the possible reductions for this scale factor. When we
1147  // collect the set of all users of some root instructions, these reduction
1148  // instructions are treated as 'final' (their uses are not considered).
1149  // This is important because we don't want the root use set to search down
1150  // the reduction chain.
1151  SmallInstructionSet PossibleRedSet;
1152  SmallInstructionSet PossibleRedLastSet;
1153  SmallInstructionSet PossibleRedPHISet;
1154  Reductions.restrictToScale(Scale, PossibleRedSet,
1155                             PossibleRedPHISet, PossibleRedLastSet);
1156
1157  // Populate "Uses" with where each instruction is used.
1158  if (!collectUsedInstructions(PossibleRedSet))
1159    return false;
1160
1161  // Make sure we mark the reduction PHIs as used in all iterations.
1162  for (auto *I : PossibleRedPHISet) {
1163    Uses[I].set(IL_All);
1164  }
1165
1166  // Make sure we mark loop-control-only PHIs as used in all iterations. See
1167  // comment above LoopReroll::isLoopControlIV for more information.
1168  BasicBlock *Header = L->getHeader();
1169  if (LoopControlIV && LoopControlIV != IV) {
1170    for (auto *U : LoopControlIV->users()) {
1171      Instruction *IVUser = dyn_cast<Instruction>(U);
1172      // IVUser could be loop increment or compare
1173      Uses[IVUser].set(IL_All);
1174      for (auto *UU : IVUser->users()) {
1175        Instruction *UUser = dyn_cast<Instruction>(UU);
1176        // UUser could be compare, PHI or branch
1177        Uses[UUser].set(IL_All);
1178        // Skip SExt
1179        if (isa<SExtInst>(UUser)) {
1180          UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
1181          Uses[UUser].set(IL_All);
1182        }
1183        // Is UUser a compare instruction?
1184        if (UU->hasOneUse()) {
1185          Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
1186          if (BI == cast<BranchInst>(Header->getTerminator()))
1187            Uses[BI].set(IL_All);
1188        }
1189      }
1190    }
1191  }
1192
1193  // Make sure all instructions in the loop are in one and only one
1194  // set.
1195  for (auto &KV : Uses) {
1196    if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
1197      LLVM_DEBUG(
1198          dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
1199                 << *KV.first << " (#uses=" << KV.second.count() << ")\n");
1200      return false;
1201    }
1202  }
1203
1204  LLVM_DEBUG(for (auto &KV
1205                  : Uses) {
1206    dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
1207  });
1208
1209  for (unsigned Iter = 1; Iter < Scale; ++Iter) {
1210    // In addition to regular aliasing information, we need to look for
1211    // instructions from later (future) iterations that have side effects
1212    // preventing us from reordering them past other instructions with side
1213    // effects.
1214    bool FutureSideEffects = false;
1215    AliasSetTracker AST(*AA);
1216    // The map between instructions in f(%iv.(i+1)) and f(%iv).
1217    DenseMap<Value *, Value *> BaseMap;
1218
1219    // Compare iteration Iter to the base.
1220    SmallInstructionSet Visited;
1221    auto BaseIt = nextInstr(0, Uses, Visited);
1222    auto RootIt = nextInstr(Iter, Uses, Visited);
1223    auto LastRootIt = Uses.begin();
1224
1225    while (BaseIt != Uses.end() && RootIt != Uses.end()) {
1226      Instruction *BaseInst = BaseIt->first;
1227      Instruction *RootInst = RootIt->first;
1228
1229      // Skip over the IV or root instructions; only match their users.
1230      bool Continue = false;
1231      if (isBaseInst(BaseInst)) {
1232        Visited.insert(BaseInst);
1233        BaseIt = nextInstr(0, Uses, Visited);
1234        Continue = true;
1235      }
1236      if (isRootInst(RootInst)) {
1237        LastRootIt = RootIt;
1238        Visited.insert(RootInst);
1239        RootIt = nextInstr(Iter, Uses, Visited);
1240        Continue = true;
1241      }
1242      if (Continue) continue;
1243
1244      if (!BaseInst->isSameOperationAs(RootInst)) {
1245        // Last chance saloon. We don't try and solve the full isomorphism
1246        // problem, but try and at least catch the case where two instructions
1247        // *of different types* are round the wrong way. We won't be able to
1248        // efficiently tell, given two ADD instructions, which way around we
1249        // should match them, but given an ADD and a SUB, we can at least infer
1250        // which one is which.
1251        //
1252        // This should allow us to deal with a greater subset of the isomorphism
1253        // problem. It does however change a linear algorithm into a quadratic
1254        // one, so limit the number of probes we do.
1255        auto TryIt = RootIt;
1256        unsigned N = NumToleratedFailedMatches;
1257        while (TryIt != Uses.end() &&
1258               !BaseInst->isSameOperationAs(TryIt->first) &&
1259               N--) {
1260          ++TryIt;
1261          TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
1262        }
1263
1264        if (TryIt == Uses.end() || TryIt == RootIt ||
1265            instrDependsOn(TryIt->first, RootIt, TryIt)) {
1266          LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
1267                            << *BaseInst << " vs. " << *RootInst << "\n");
1268          return false;
1269        }
1270
1271        RootIt = TryIt;
1272        RootInst = TryIt->first;
1273      }
1274
1275      // All instructions between the last root and this root
1276      // may belong to some other iteration. If they belong to a
1277      // future iteration, then they're dangerous to alias with.
1278      //
1279      // Note that because we allow a limited amount of flexibility in the order
1280      // that we visit nodes, LastRootIt might be *before* RootIt, in which
1281      // case we've already checked this set of instructions so we shouldn't
1282      // do anything.
1283      for (; LastRootIt < RootIt; ++LastRootIt) {
1284        Instruction *I = LastRootIt->first;
1285        if (LastRootIt->second.find_first() < (int)Iter)
1286          continue;
1287        if (I->mayWriteToMemory())
1288          AST.add(I);
1289        // Note: This is specifically guarded by a check on isa<PHINode>,
1290        // which while a valid (somewhat arbitrary) micro-optimization, is
1291        // needed because otherwise isSafeToSpeculativelyExecute returns
1292        // false on PHI nodes.
1293        if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
1294            !isSafeToSpeculativelyExecute(I))
1295          // Intervening instructions cause side effects.
1296          FutureSideEffects = true;
1297      }
1298
1299      // Make sure that this instruction, which is in the use set of this
1300      // root instruction, does not also belong to the base set or the set of
1301      // some other root instruction.
1302      if (RootIt->second.count() > 1) {
1303        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
1304                          << " vs. " << *RootInst << " (prev. case overlap)\n");
1305        return false;
1306      }
1307
1308      // Make sure that we don't alias with any instruction in the alias set
1309      // tracker. If we do, then we depend on a future iteration, and we
1310      // can't reroll.
1311      if (RootInst->mayReadFromMemory())
1312        for (auto &K : AST) {
1313          if (K.aliasesUnknownInst(RootInst, *AA)) {
1314            LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
1315                              << *BaseInst << " vs. " << *RootInst
1316                              << " (depends on future store)\n");
1317            return false;
1318          }
1319        }
1320
1321      // If we've past an instruction from a future iteration that may have
1322      // side effects, and this instruction might also, then we can't reorder
1323      // them, and this matching fails. As an exception, we allow the alias
1324      // set tracker to handle regular (unordered) load/store dependencies.
1325      if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
1326                                 !isSafeToSpeculativelyExecute(BaseInst)) ||
1327                                (!isUnorderedLoadStore(RootInst) &&
1328                                 !isSafeToSpeculativelyExecute(RootInst)))) {
1329        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
1330                          << " vs. " << *RootInst
1331                          << " (side effects prevent reordering)\n");
1332        return false;
1333      }
1334
1335      // For instructions that are part of a reduction, if the operation is
1336      // associative, then don't bother matching the operands (because we
1337      // already know that the instructions are isomorphic, and the order
1338      // within the iteration does not matter). For non-associative reductions,
1339      // we do need to match the operands, because we need to reject
1340      // out-of-order instructions within an iteration!
1341      // For example (assume floating-point addition), we need to reject this:
1342      //   x += a[i]; x += b[i];
1343      //   x += a[i+1]; x += b[i+1];
1344      //   x += b[i+2]; x += a[i+2];
1345      bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
1346
1347      if (!(InReduction && BaseInst->isAssociative())) {
1348        bool Swapped = false, SomeOpMatched = false;
1349        for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
1350          Value *Op2 = RootInst->getOperand(j);
1351
1352          // If this is part of a reduction (and the operation is not
1353          // associatve), then we match all operands, but not those that are
1354          // part of the reduction.
1355          if (InReduction)
1356            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
1357              if (Reductions.isPairInSame(RootInst, Op2I))
1358                continue;
1359
1360          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
1361          if (BMI != BaseMap.end()) {
1362            Op2 = BMI->second;
1363          } else {
1364            for (auto &DRS : RootSets) {
1365              if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
1366                Op2 = DRS.BaseInst;
1367                break;
1368              }
1369            }
1370          }
1371
1372          if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
1373            // If we've not already decided to swap the matched operands, and
1374            // we've not already matched our first operand (note that we could
1375            // have skipped matching the first operand because it is part of a
1376            // reduction above), and the instruction is commutative, then try
1377            // the swapped match.
1378            if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
1379                BaseInst->getOperand(!j) == Op2) {
1380              Swapped = true;
1381            } else {
1382              LLVM_DEBUG(dbgs()
1383                         << "LRR: iteration root match failed at " << *BaseInst
1384                         << " vs. " << *RootInst << " (operand " << j << ")\n");
1385              return false;
1386            }
1387          }
1388
1389          SomeOpMatched = true;
1390        }
1391      }
1392
1393      if ((!PossibleRedLastSet.count(BaseInst) &&
1394           hasUsesOutsideLoop(BaseInst, L)) ||
1395          (!PossibleRedLastSet.count(RootInst) &&
1396           hasUsesOutsideLoop(RootInst, L))) {
1397        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
1398                          << " vs. " << *RootInst << " (uses outside loop)\n");
1399        return false;
1400      }
1401
1402      Reductions.recordPair(BaseInst, RootInst, Iter);
1403      BaseMap.insert(std::make_pair(RootInst, BaseInst));
1404
1405      LastRootIt = RootIt;
1406      Visited.insert(BaseInst);
1407      Visited.insert(RootInst);
1408      BaseIt = nextInstr(0, Uses, Visited);
1409      RootIt = nextInstr(Iter, Uses, Visited);
1410    }
1411    assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
1412           "Mismatched set sizes!");
1413  }
1414
1415  LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
1416                    << "\n");
1417
1418  return true;
1419}
1420
1421void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
1422  BasicBlock *Header = L->getHeader();
1423
1424  // Compute the start and increment for each BaseInst before we start erasing
1425  // instructions.
1426  SmallVector<const SCEV *, 8> StartExprs;
1427  SmallVector<const SCEV *, 8> IncrExprs;
1428  for (auto &DRS : RootSets) {
1429    const SCEVAddRecExpr *IVSCEV =
1430        cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
1431    StartExprs.push_back(IVSCEV->getStart());
1432    IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
1433  }
1434
1435  // Remove instructions associated with non-base iterations.
1436  for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
1437       J != JE;) {
1438    unsigned I = Uses[&*J].find_first();
1439    if (I > 0 && I < IL_All) {
1440      LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
1441      J++->eraseFromParent();
1442      continue;
1443    }
1444
1445    ++J;
1446  }
1447
1448  // Rewrite each BaseInst using SCEV.
1449  for (size_t i = 0, e = RootSets.size(); i != e; ++i)
1450    // Insert the new induction variable.
1451    replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
1452
1453  { // Limit the lifetime of SCEVExpander.
1454    BranchInst *BI = cast<BranchInst>(Header->getTerminator());
1455    const DataLayout &DL = Header->getModule()->getDataLayout();
1456    SCEVExpander Expander(*SE, DL, "reroll");
1457    auto Zero = SE->getZero(BackedgeTakenCount->getType());
1458    auto One = SE->getOne(BackedgeTakenCount->getType());
1459    auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
1460    Value *NewIV =
1461        Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
1462                               Header->getFirstNonPHIOrDbg());
1463    // FIXME: This arithmetic can overflow.
1464    auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
1465    auto ScaledTripCount = SE->getMulExpr(
1466        TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
1467    auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
1468    Value *TakenCount =
1469        Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
1470                               Header->getFirstNonPHIOrDbg());
1471    Value *Cond =
1472        new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
1473    BI->setCondition(Cond);
1474
1475    if (BI->getSuccessor(1) != Header)
1476      BI->swapSuccessors();
1477  }
1478
1479  SimplifyInstructionsInBlock(Header, TLI);
1480  DeleteDeadPHIs(Header, TLI);
1481}
1482
1483void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
1484                                           const SCEV *Start,
1485                                           const SCEV *IncrExpr) {
1486  BasicBlock *Header = L->getHeader();
1487  Instruction *Inst = DRS.BaseInst;
1488
1489  const SCEV *NewIVSCEV =
1490      SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
1491
1492  { // Limit the lifetime of SCEVExpander.
1493    const DataLayout &DL = Header->getModule()->getDataLayout();
1494    SCEVExpander Expander(*SE, DL, "reroll");
1495    Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
1496                                          Header->getFirstNonPHIOrDbg());
1497
1498    for (auto &KV : Uses)
1499      if (KV.second.find_first() == 0)
1500        KV.first->replaceUsesOfWith(Inst, NewIV);
1501  }
1502}
1503
1504// Validate the selected reductions. All iterations must have an isomorphic
1505// part of the reduction chain and, for non-associative reductions, the chain
1506// entries must appear in order.
1507bool LoopReroll::ReductionTracker::validateSelected() {
1508  // For a non-associative reduction, the chain entries must appear in order.
1509  for (int i : Reds) {
1510    int PrevIter = 0, BaseCount = 0, Count = 0;
1511    for (Instruction *J : PossibleReds[i]) {
1512      // Note that all instructions in the chain must have been found because
1513      // all instructions in the function must have been assigned to some
1514      // iteration.
1515      int Iter = PossibleRedIter[J];
1516      if (Iter != PrevIter && Iter != PrevIter + 1 &&
1517          !PossibleReds[i].getReducedValue()->isAssociative()) {
1518        LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
1519                          << J << "\n");
1520        return false;
1521      }
1522
1523      if (Iter != PrevIter) {
1524        if (Count != BaseCount) {
1525          LLVM_DEBUG(dbgs()
1526                     << "LRR: Iteration " << PrevIter << " reduction use count "
1527                     << Count << " is not equal to the base use count "
1528                     << BaseCount << "\n");
1529          return false;
1530        }
1531
1532        Count = 0;
1533      }
1534
1535      ++Count;
1536      if (Iter == 0)
1537        ++BaseCount;
1538
1539      PrevIter = Iter;
1540    }
1541  }
1542
1543  return true;
1544}
1545
1546// For all selected reductions, remove all parts except those in the first
1547// iteration (and the PHI). Replace outside uses of the reduced value with uses
1548// of the first-iteration reduced value (in other words, reroll the selected
1549// reductions).
1550void LoopReroll::ReductionTracker::replaceSelected() {
1551  // Fixup reductions to refer to the last instruction associated with the
1552  // first iteration (not the last).
1553  for (int i : Reds) {
1554    int j = 0;
1555    for (int e = PossibleReds[i].size(); j != e; ++j)
1556      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
1557        --j;
1558        break;
1559      }
1560
1561    // Replace users with the new end-of-chain value.
1562    SmallInstructionVector Users;
1563    for (User *U : PossibleReds[i].getReducedValue()->users()) {
1564      Users.push_back(cast<Instruction>(U));
1565    }
1566
1567    for (Instruction *User : Users)
1568      User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
1569                              PossibleReds[i][j]);
1570  }
1571}
1572
1573// Reroll the provided loop with respect to the provided induction variable.
1574// Generally, we're looking for a loop like this:
1575//
1576// %iv = phi [ (preheader, ...), (body, %iv.next) ]
1577// f(%iv)
1578// %iv.1 = add %iv, 1                <-- a root increment
1579// f(%iv.1)
1580// %iv.2 = add %iv, 2                <-- a root increment
1581// f(%iv.2)
1582// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
1583// f(%iv.scale_m_1)
1584// ...
1585// %iv.next = add %iv, scale
1586// %cmp = icmp(%iv, ...)
1587// br %cmp, header, exit
1588//
1589// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
1590// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
1591// be intermixed with eachother. The restriction imposed by this algorithm is
1592// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
1593// etc. be the same.
1594//
1595// First, we collect the use set of %iv, excluding the other increment roots.
1596// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
1597// times, having collected the use set of f(%iv.(i+1)), during which we:
1598//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
1599//     the next unmatched instruction in f(%iv.(i+1)).
1600//   - Ensure that both matched instructions don't have any external users
1601//     (with the exception of last-in-chain reduction instructions).
1602//   - Track the (aliasing) write set, and other side effects, of all
1603//     instructions that belong to future iterations that come before the matched
1604//     instructions. If the matched instructions read from that write set, then
1605//     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
1606//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
1607//     if any of these future instructions had side effects (could not be
1608//     speculatively executed), and so do the matched instructions, when we
1609//     cannot reorder those side-effect-producing instructions, and rerolling
1610//     fails.
1611//
1612// Finally, we make sure that all loop instructions are either loop increment
1613// roots, belong to simple latch code, parts of validated reductions, part of
1614// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
1615// have been validated), then we reroll the loop.
1616bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
1617                        const SCEV *BackedgeTakenCount,
1618                        ReductionTracker &Reductions) {
1619  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
1620                          IVToIncMap, LoopControlIV);
1621
1622  if (!DAGRoots.findRoots())
1623    return false;
1624  LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
1625                    << "\n");
1626
1627  if (!DAGRoots.validate(Reductions))
1628    return false;
1629  if (!Reductions.validateSelected())
1630    return false;
1631  // At this point, we've validated the rerolling, and we're committed to
1632  // making changes!
1633
1634  Reductions.replaceSelected();
1635  DAGRoots.replace(BackedgeTakenCount);
1636
1637  ++NumRerolledLoops;
1638  return true;
1639}
1640
1641bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
1642  if (skipLoop(L))
1643    return false;
1644
1645  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1646  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1647  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1648  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
1649      *L->getHeader()->getParent());
1650  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1651  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1652
1653  BasicBlock *Header = L->getHeader();
1654  LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
1655                    << Header->getName() << " (" << L->getNumBlocks()
1656                    << " block(s))\n");
1657
1658  // For now, we'll handle only single BB loops.
1659  if (L->getNumBlocks() > 1)
1660    return false;
1661
1662  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
1663    return false;
1664
1665  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
1666  LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
1667  LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
1668               << "\n");
1669
1670  // First, we need to find the induction variable with respect to which we can
1671  // reroll (there may be several possible options).
1672  SmallInstructionVector PossibleIVs;
1673  IVToIncMap.clear();
1674  LoopControlIV = nullptr;
1675  collectPossibleIVs(L, PossibleIVs);
1676
1677  if (PossibleIVs.empty()) {
1678    LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
1679    return false;
1680  }
1681
1682  ReductionTracker Reductions;
1683  collectPossibleReductions(L, Reductions);
1684  bool Changed = false;
1685
1686  // For each possible IV, collect the associated possible set of 'root' nodes
1687  // (i+1, i+2, etc.).
1688  for (Instruction *PossibleIV : PossibleIVs)
1689    if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
1690      Changed = true;
1691      break;
1692    }
1693  LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
1694
1695  // Trip count of L has changed so SE must be re-evaluated.
1696  if (Changed)
1697    SE->forgetLoop(L);
1698
1699  return Changed;
1700}
1701