1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller.  It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
// counts of loops easily.
//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/DenseMapInfo.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/None.h"
19#include "llvm/ADT/Optional.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include "llvm/Analysis/AssumptionCache.h"
26#include "llvm/Analysis/BlockFrequencyInfo.h"
27#include "llvm/Analysis/CodeMetrics.h"
28#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
29#include "llvm/Analysis/LoopAnalysisManager.h"
30#include "llvm/Analysis/LoopInfo.h"
31#include "llvm/Analysis/LoopPass.h"
32#include "llvm/Analysis/LoopUnrollAnalyzer.h"
33#include "llvm/Analysis/OptimizationRemarkEmitter.h"
34#include "llvm/Analysis/ProfileSummaryInfo.h"
35#include "llvm/Analysis/ScalarEvolution.h"
36#include "llvm/Analysis/TargetTransformInfo.h"
37#include "llvm/IR/BasicBlock.h"
38#include "llvm/IR/CFG.h"
39#include "llvm/IR/Constant.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DiagnosticInfo.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Instructions.h"
46#include "llvm/IR/IntrinsicInst.h"
47#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
49#include "llvm/InitializePasses.h"
50#include "llvm/Pass.h"
51#include "llvm/Support/Casting.h"
52#include "llvm/Support/CommandLine.h"
53#include "llvm/Support/Debug.h"
54#include "llvm/Support/ErrorHandling.h"
55#include "llvm/Support/raw_ostream.h"
56#include "llvm/Transforms/Scalar.h"
57#include "llvm/Transforms/Scalar/LoopPassManager.h"
58#include "llvm/Transforms/Utils.h"
59#include "llvm/Transforms/Utils/LoopSimplify.h"
60#include "llvm/Transforms/Utils/LoopUtils.h"
61#include "llvm/Transforms/Utils/SizeOpts.h"
62#include "llvm/Transforms/Utils/UnrollLoop.h"
63#include <algorithm>
64#include <cassert>
65#include <cstdint>
66#include <limits>
67#include <string>
68#include <tuple>
69#include <utility>
70
71using namespace llvm;
72
73#define DEBUG_TYPE "loop-unroll"
74
75cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
76    "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
77    cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
             " the current top-most loop. This is sometimes preferred to reduce"
79             " compile time."));
80
81static cl::opt<unsigned>
82    UnrollThreshold("unroll-threshold", cl::Hidden,
83                    cl::desc("The cost threshold for loop unrolling"));
84
85static cl::opt<unsigned> UnrollPartialThreshold(
86    "unroll-partial-threshold", cl::Hidden,
87    cl::desc("The cost threshold for partial loop unrolling"));
88
89static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
90    "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
91    cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
92             "to the threshold when aggressively unrolling a loop due to the "
93             "dynamic cost savings. If completely unrolling a loop will reduce "
94             "the total runtime from X to Y, we boost the loop unroll "
95             "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
96             "X/Y). This limit avoids excessive code bloat."));
97
98static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
99    "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
    cl::desc("Don't allow loop unrolling to simulate more than this number of "
101             "iterations when checking full unroll profitability"));
102
103static cl::opt<unsigned> UnrollCount(
104    "unroll-count", cl::Hidden,
105    cl::desc("Use this unroll count for all loops including those with "
106             "unroll_count pragma values, for testing purposes"));
107
108static cl::opt<unsigned> UnrollMaxCount(
109    "unroll-max-count", cl::Hidden,
    cl::desc("Set the max unroll count for partial and runtime unrolling, for "
111             "testing purposes"));
112
113static cl::opt<unsigned> UnrollFullMaxCount(
114    "unroll-full-max-count", cl::Hidden,
115    cl::desc(
116        "Set the max unroll count for full unrolling, for testing purposes"));
117
118static cl::opt<unsigned> UnrollPeelCount(
119    "unroll-peel-count", cl::Hidden,
120    cl::desc("Set the unroll peeling count, for testing purposes"));
121
122static cl::opt<bool>
123    UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
124                       cl::desc("Allows loops to be partially unrolled until "
125                                "-unroll-threshold loop size is reached."));
126
127static cl::opt<bool> UnrollAllowRemainder(
128    "unroll-allow-remainder", cl::Hidden,
129    cl::desc("Allow generation of a loop remainder (extra iterations) "
130             "when unrolling a loop."));
131
132static cl::opt<bool>
133    UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
134                  cl::desc("Unroll loops with run-time trip counts"));
135
136static cl::opt<unsigned> UnrollMaxUpperBound(
137    "unroll-max-upperbound", cl::init(8), cl::Hidden,
138    cl::desc(
        "The max trip count upper bound that is considered in unrolling"));
140
141static cl::opt<unsigned> PragmaUnrollThreshold(
142    "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
143    cl::desc("Unrolled size limit for loops with an unroll(full) or "
144             "unroll_count pragma."));
145
146static cl::opt<unsigned> FlatLoopTripCountThreshold(
147    "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
148    cl::desc("If the runtime tripcount for the loop is lower than the "
             "threshold, the loop is considered flat and will be less "
150             "aggressively unrolled."));
151
152static cl::opt<bool>
153    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
154                       cl::desc("Allows loops to be peeled when the dynamic "
155                                "trip count is known to be low."));
156
157static cl::opt<bool> UnrollUnrollRemainder(
158  "unroll-remainder", cl::Hidden,
159  cl::desc("Allow the loop remainder to be unrolled."));
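
// For illustration only (a hedged sketch of a typical invocation, not an
// exhaustive reference): the options above combine on the opt command line,
// e.g.
//   opt -loop-unroll -unroll-allow-partial -unroll-count=4 -S in.ll -o out.ll
// requests partial unrolling by a factor of 4 from the legacy pass registered
// below as "loop-unroll".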
160
// This option isn't ever intended to be enabled; it serves to allow
162// experiments to check the assumptions about when this kind of revisit is
163// necessary.
164static cl::opt<bool> UnrollRevisitChildLoops(
165    "unroll-revisit-child-loops", cl::Hidden,
166    cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
167             "This shouldn't typically be needed as child loops (or their "
168             "clones) were already visited."));
169
170/// A magic value for use with the Threshold parameter to indicate
171/// that the loop unroll should be performed regardless of how much
172/// code expansion would result.
173static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
174
175/// Gather the various unrolling parameters based on the defaults, compiler
176/// flags, TTI overrides and user specified parameters.
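/// For example (an illustrative combination): at -O3 with no TTI override and
/// -unroll-threshold=500 on the command line, the effective UP.Threshold is
/// 500: defaults are applied first, then TTI overrides, then cl::opt values,
/// then the explicit User* arguments, each layer overriding the previous one.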
177TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
178    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
179    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
180    Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
181    Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
182    Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling,
183    Optional<bool> UserAllowProfileBasedPeeling,
184    Optional<unsigned> UserFullUnrollMaxCount) {
185  TargetTransformInfo::UnrollingPreferences UP;
186
187  // Set up the defaults
188  UP.Threshold = OptLevel > 2 ? 300 : 150;
189  UP.MaxPercentThresholdBoost = 400;
190  UP.OptSizeThreshold = 0;
191  UP.PartialThreshold = 150;
192  UP.PartialOptSizeThreshold = 0;
193  UP.Count = 0;
194  UP.PeelCount = 0;
195  UP.DefaultUnrollRuntimeCount = 8;
196  UP.MaxCount = std::numeric_limits<unsigned>::max();
197  UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
198  UP.BEInsns = 2;
199  UP.Partial = false;
200  UP.Runtime = false;
201  UP.AllowRemainder = true;
202  UP.UnrollRemainder = false;
203  UP.AllowExpensiveTripCount = false;
204  UP.Force = false;
205  UP.UpperBound = false;
206  UP.AllowPeeling = true;
207  UP.UnrollAndJam = false;
208  UP.PeelProfiledIterations = true;
209  UP.UnrollAndJamInnerLoopThreshold = 60;
210
211  // Override with any target specific settings
212  TTI.getUnrollingPreferences(L, SE, UP);
213
214  // Apply size attributes
215  bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
216                    llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
217                                                PGSOQueryType::IRPass);
218  if (OptForSize) {
219    UP.Threshold = UP.OptSizeThreshold;
220    UP.PartialThreshold = UP.PartialOptSizeThreshold;
221    UP.MaxPercentThresholdBoost = 100;
222  }
223
224  // Apply any user values specified by cl::opt
225  if (UnrollThreshold.getNumOccurrences() > 0)
226    UP.Threshold = UnrollThreshold;
227  if (UnrollPartialThreshold.getNumOccurrences() > 0)
228    UP.PartialThreshold = UnrollPartialThreshold;
229  if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
230    UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
231  if (UnrollMaxCount.getNumOccurrences() > 0)
232    UP.MaxCount = UnrollMaxCount;
233  if (UnrollFullMaxCount.getNumOccurrences() > 0)
234    UP.FullUnrollMaxCount = UnrollFullMaxCount;
235  if (UnrollPeelCount.getNumOccurrences() > 0)
236    UP.PeelCount = UnrollPeelCount;
237  if (UnrollAllowPartial.getNumOccurrences() > 0)
238    UP.Partial = UnrollAllowPartial;
239  if (UnrollAllowRemainder.getNumOccurrences() > 0)
240    UP.AllowRemainder = UnrollAllowRemainder;
241  if (UnrollRuntime.getNumOccurrences() > 0)
242    UP.Runtime = UnrollRuntime;
243  if (UnrollMaxUpperBound == 0)
244    UP.UpperBound = false;
245  if (UnrollAllowPeeling.getNumOccurrences() > 0)
246    UP.AllowPeeling = UnrollAllowPeeling;
247  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
248    UP.UnrollRemainder = UnrollUnrollRemainder;
249
250  // Apply user values provided by argument
251  if (UserThreshold.hasValue()) {
252    UP.Threshold = *UserThreshold;
253    UP.PartialThreshold = *UserThreshold;
254  }
255  if (UserCount.hasValue())
256    UP.Count = *UserCount;
257  if (UserAllowPartial.hasValue())
258    UP.Partial = *UserAllowPartial;
259  if (UserRuntime.hasValue())
260    UP.Runtime = *UserRuntime;
261  if (UserUpperBound.hasValue())
262    UP.UpperBound = *UserUpperBound;
263  if (UserAllowPeeling.hasValue())
264    UP.AllowPeeling = *UserAllowPeeling;
265  if (UserAllowProfileBasedPeeling.hasValue())
266    UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
267  if (UserFullUnrollMaxCount.hasValue())
268    UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
269
270  return UP;
271}
272
273namespace {
274
275/// A struct to densely store the state of an instruction after unrolling at
276/// each iteration.
277///
278/// This is designed to work like a tuple of <Instruction *, int> for the
279/// purposes of hashing and lookup, but to be able to associate two boolean
280/// states with each key.
281struct UnrolledInstState {
282  Instruction *I;
283  int Iteration : 30;
284  unsigned IsFree : 1;
285  unsigned IsCounted : 1;
286};
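
// Note that Iteration, IsFree and IsCounted are packed into a single 32-bit
// word, so an UnrolledInstState should be no larger than the
// <Instruction *, int> pair it emulates for hashing and lookup.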
287
288/// Hashing and equality testing for a set of the instruction states.
289struct UnrolledInstStateKeyInfo {
290  using PtrInfo = DenseMapInfo<Instruction *>;
291  using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
292
293  static inline UnrolledInstState getEmptyKey() {
294    return {PtrInfo::getEmptyKey(), 0, 0, 0};
295  }
296
297  static inline UnrolledInstState getTombstoneKey() {
298    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
299  }
300
301  static inline unsigned getHashValue(const UnrolledInstState &S) {
302    return PairInfo::getHashValue({S.I, S.Iteration});
303  }
304
305  static inline bool isEqual(const UnrolledInstState &LHS,
306                             const UnrolledInstState &RHS) {
307    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
308  }
309};
310
311struct EstimatedUnrollCost {
312  /// The estimated cost after unrolling.
313  unsigned UnrolledCost;
314
315  /// The estimated dynamic cost of executing the instructions in the
316  /// rolled form.
317  unsigned RolledDynamicCost;
318};
319
320} // end anonymous namespace
321
322/// Figure out if the loop is worth full unrolling.
323///
324/// Complete loop unrolling can make some loads constant, and we need to know
325/// if that would expose any further optimization opportunities.  This routine
/// estimates this optimization.  It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By
328/// dynamic cost we mean that we won't count costs of blocks that are known not
329/// to be executed (i.e. if we have a branch in the loop and we know that at the
330/// given iteration its condition would be resolved to true, we won't add up the
331/// cost of the 'false'-block).
332/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
333/// the analysis failed (no benefits expected from the unrolling, or the loop is
334/// too big to analyze), the returned value is None.
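/// For example (an illustrative scenario): if on some iteration the analysis
/// proves a branch condition folds to false, the 'true' successor is not
/// visited on that iteration and contributes to neither cost, while an
/// instruction that folds to a constant still adds to RolledDynamicCost but
/// is treated as free in UnrolledCost.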
335static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
336    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
337    const SmallPtrSetImpl<const Value *> &EphValues,
338    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
339  // We want to be able to scale offsets by the trip count and add more offsets
340  // to them without checking for overflows, and we already don't want to
341  // analyze *massive* trip counts, so we force the max to be reasonably small.
342  assert(UnrollMaxIterationsCountToAnalyze <
343             (unsigned)(std::numeric_limits<int>::max() / 2) &&
344         "The unroll iterations max is too large!");
345
346  // Only analyze inner loops. We can't properly estimate cost of nested loops
347  // and we won't visit inner loops again anyway.
348  if (!L->empty())
349    return None;
350
351  // Don't simulate loops with a big or unknown tripcount
352  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
353      TripCount > UnrollMaxIterationsCountToAnalyze)
354    return None;
355
356  SmallSetVector<BasicBlock *, 16> BBWorklist;
357  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
358  DenseMap<Value *, Constant *> SimplifiedValues;
359  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
360
361  // The estimated cost of the unrolled form of the loop. We try to estimate
362  // this by simplifying as much as we can while computing the estimate.
363  unsigned UnrolledCost = 0;
364
365  // We also track the estimated dynamic (that is, actually executed) cost in
366  // the rolled form. This helps identify cases when the savings from unrolling
367  // aren't just exposing dead control flows, but actual reduced dynamic
368  // instructions due to the simplifications which we expect to occur after
369  // unrolling.
370  unsigned RolledDynamicCost = 0;
371
372  // We track the simplification of each instruction in each iteration. We use
373  // this to recursively merge costs into the unrolled cost on-demand so that
374  // we don't count the cost of any dead code. This is essentially a map from
375  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
376  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
377
378  // A small worklist used to accumulate cost of instructions from each
379  // observable and reached root in the loop.
380  SmallVector<Instruction *, 16> CostWorklist;
381
382  // PHI-used worklist used between iterations while accumulating cost.
383  SmallVector<Instruction *, 4> PHIUsedList;
384
385  // Helper function to accumulate cost for instructions in the loop.
386  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
387    assert(Iteration >= 0 && "Cannot have a negative iteration!");
388    assert(CostWorklist.empty() && "Must start with an empty cost list");
389    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
390    CostWorklist.push_back(&RootI);
391    for (;; --Iteration) {
392      do {
393        Instruction *I = CostWorklist.pop_back_val();
394
        // InstCostMap only uses I and Iteration as a key; the other two values
        // don't matter here.
397        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
398        if (CostIter == InstCostMap.end())
399          // If an input to a PHI node comes from a dead path through the loop
400          // we may have no cost data for it here. What that actually means is
401          // that it is free.
402          continue;
403        auto &Cost = *CostIter;
404        if (Cost.IsCounted)
405          // Already counted this instruction.
406          continue;
407
408        // Mark that we are counting the cost of this instruction now.
409        Cost.IsCounted = true;
410
411        // If this is a PHI node in the loop header, just add it to the PHI set.
412        if (auto *PhiI = dyn_cast<PHINode>(I))
413          if (PhiI->getParent() == L->getHeader()) {
414            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
415                                  "inherently simplify during unrolling.");
416            if (Iteration == 0)
417              continue;
418
419            // Push the incoming value from the backedge into the PHI used list
420            // if it is an in-loop instruction. We'll use this to populate the
421            // cost worklist for the next iteration (as we count backwards).
422            if (auto *OpI = dyn_cast<Instruction>(
423                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
424              if (L->contains(OpI))
425                PHIUsedList.push_back(OpI);
426            continue;
427          }
428
429        // First accumulate the cost of this instruction.
430        if (!Cost.IsFree) {
431          UnrolledCost += TTI.getUserCost(I);
432          LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
433                            << Iteration << "): ");
434          LLVM_DEBUG(I->dump());
435        }
436
437        // We must count the cost of every operand which is not free,
438        // recursively. If we reach a loop PHI node, simply add it to the set
439        // to be considered on the next iteration (backwards!).
440        for (Value *Op : I->operands()) {
441          // Check whether this operand is free due to being a constant or
442          // outside the loop.
443          auto *OpI = dyn_cast<Instruction>(Op);
444          if (!OpI || !L->contains(OpI))
445            continue;
446
447          // Otherwise accumulate its cost.
448          CostWorklist.push_back(OpI);
449        }
450      } while (!CostWorklist.empty());
451
452      if (PHIUsedList.empty())
453        // We've exhausted the search.
454        break;
455
456      assert(Iteration > 0 &&
457             "Cannot track PHI-used values past the first iteration!");
458      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
459      PHIUsedList.clear();
460    }
461  };
462
463  // Ensure that we don't violate the loop structure invariants relied on by
464  // this analysis.
465  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
466  assert(L->isLCSSAForm(DT) &&
467         "Must have loops in LCSSA form to track live-out values.");
468
469  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
470
  // Simulate execution of each iteration of the loop, counting the
  // instructions that would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all of the loop's iterations.
475  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
476    LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
477
478    // Prepare for the iteration by collecting any simplified entry or backedge
479    // inputs.
480    for (Instruction &I : *L->getHeader()) {
481      auto *PHI = dyn_cast<PHINode>(&I);
482      if (!PHI)
483        break;
484
      // The loop header PHI nodes must have exactly two inputs: one from the
486      // loop preheader and one from the loop latch.
487      assert(
488          PHI->getNumIncomingValues() == 2 &&
489          "Must have an incoming value only for the preheader and the latch.");
490
491      Value *V = PHI->getIncomingValueForBlock(
492          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
493      Constant *C = dyn_cast<Constant>(V);
494      if (Iteration != 0 && !C)
495        C = SimplifiedValues.lookup(V);
496      if (C)
497        SimplifiedInputValues.push_back({PHI, C});
498    }
499
500    // Now clear and re-populate the map for the next iteration.
501    SimplifiedValues.clear();
502    while (!SimplifiedInputValues.empty())
503      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
504
505    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
506
507    BBWorklist.clear();
508    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size; this loop grows the worklist.
510    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
511      BasicBlock *BB = BBWorklist[Idx];
512
513      // Visit all instructions in the given basic block and try to simplify
514      // it.  We don't change the actual IR, just count optimization
515      // opportunities.
516      for (Instruction &I : *BB) {
517        // These won't get into the final code - don't even try calculating the
518        // cost for them.
519        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
520          continue;
521
522        // Track this instruction's expected baseline cost when executing the
523        // rolled loop form.
524        RolledDynamicCost += TTI.getUserCost(&I);
525
526        // Visit the instruction to analyze its loop cost after unrolling,
527        // and if the visitor returns true, mark the instruction as free after
528        // unrolling and continue.
529        bool IsFree = Analyzer.visit(I);
530        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
531                                           (unsigned)IsFree,
532                                           /*IsCounted*/ false}).second;
533        (void)Inserted;
534        assert(Inserted && "Cannot have a state for an unvisited instruction!");
535
536        if (IsFree)
537          continue;
538
539        // Can't properly model a cost of a call.
540        // FIXME: With a proper cost model we should be able to do it.
541        if (auto *CI = dyn_cast<CallInst>(&I)) {
542          const Function *Callee = CI->getCalledFunction();
543          if (!Callee || TTI.isLoweredToCall(Callee)) {
544            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
545            return None;
546          }
547        }
548
549        // If the instruction might have a side-effect recursively account for
550        // the cost of it and all the instructions leading up to it.
551        if (I.mayHaveSideEffects())
552          AddCostRecursively(I, Iteration);
553
554        // If unrolled body turns out to be too big, bail out.
555        if (UnrolledCost > MaxUnrolledLoopSize) {
556          LLVM_DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
557                            << "  UnrolledCost: " << UnrolledCost
558                            << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
559                            << "\n");
560          return None;
561        }
562      }
563
564      Instruction *TI = BB->getTerminator();
565
      // Add in the live successors by first checking whether we have a
      // terminator that may be simplified based on the values simplified by
      // this call.
568      BasicBlock *KnownSucc = nullptr;
569      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
570        if (BI->isConditional()) {
571          if (Constant *SimpleCond =
572                  SimplifiedValues.lookup(BI->getCondition())) {
573            // Just take the first successor if condition is undef
574            if (isa<UndefValue>(SimpleCond))
575              KnownSucc = BI->getSuccessor(0);
576            else if (ConstantInt *SimpleCondVal =
577                         dyn_cast<ConstantInt>(SimpleCond))
578              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
579          }
580        }
581      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
582        if (Constant *SimpleCond =
583                SimplifiedValues.lookup(SI->getCondition())) {
584          // Just take the first successor if condition is undef
585          if (isa<UndefValue>(SimpleCond))
586            KnownSucc = SI->getSuccessor(0);
587          else if (ConstantInt *SimpleCondVal =
588                       dyn_cast<ConstantInt>(SimpleCond))
589            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
590        }
591      }
592      if (KnownSucc) {
593        if (L->contains(KnownSucc))
594          BBWorklist.insert(KnownSucc);
595        else
596          ExitWorklist.insert({BB, KnownSucc});
597        continue;
598      }
599
600      // Add BB's successors to the worklist.
601      for (BasicBlock *Succ : successors(BB))
602        if (L->contains(Succ))
603          BBWorklist.insert(Succ);
604        else
605          ExitWorklist.insert({BB, Succ});
606      AddCostRecursively(*TI, Iteration);
607    }
608
609    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
611    if (UnrolledCost == RolledDynamicCost) {
612      LLVM_DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
613                        << "  UnrolledCost: " << UnrolledCost << "\n");
614      return None;
615    }
616  }
617
618  while (!ExitWorklist.empty()) {
619    BasicBlock *ExitingBB, *ExitBB;
620    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
621
622    for (Instruction &I : *ExitBB) {
623      auto *PN = dyn_cast<PHINode>(&I);
624      if (!PN)
625        break;
626
627      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
628      if (auto *OpI = dyn_cast<Instruction>(Op))
629        if (L->contains(OpI))
630          AddCostRecursively(*OpI, TripCount - 1);
631    }
632  }
633
634  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
635                    << "UnrolledCost: " << UnrolledCost << ", "
636                    << "RolledDynamicCost: " << RolledDynamicCost << "\n");
637  return {{UnrolledCost, RolledDynamicCost}};
638}
639
640/// ApproximateLoopSize - Approximate the size of the loop.
641unsigned llvm::ApproximateLoopSize(
642    const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
643    const TargetTransformInfo &TTI,
644    const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
645  CodeMetrics Metrics;
646  for (BasicBlock *BB : L->blocks())
647    Metrics.analyzeBasicBlock(BB, TTI, EphValues);
648  NumCalls = Metrics.NumInlineCandidates;
649  NotDuplicatable = Metrics.notDuplicatable;
650  Convergent = Metrics.convergent;
651
652  unsigned LoopSize = Metrics.NumInsts;
653
  // Don't allow an estimate of size zero.  This would allow unrolling of loops
655  // with huge iteration counts, which is a compile time problem even if it's
656  // not a problem for code quality. Also, the code using this size may assume
657  // that each loop has at least three instructions (likely a conditional
658  // branch, a comparison feeding that branch, and some kind of loop increment
659  // feeding that comparison instruction).
660  LoopSize = std::max(LoopSize, BEInsns + 1);
661
662  return LoopSize;
663}
664
665// Returns the loop hint metadata node with the given name (for example,
666// "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
667// returned.
668static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
669  if (MDNode *LoopID = L->getLoopID())
670    return GetUnrollMetadata(LoopID, Name);
671  return nullptr;
672}
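
// For illustration (a hedged sketch, not normative syntax): a loop written as
//   #pragma clang loop unroll_count(4)
//   for (int i = 0; i < n; ++i) body(i);
// is annotated by Clang with loop metadata roughly of the form
//   !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.count", i32 4}
// and UnrollCountPragmaValue below reads the 4 from operand 1 of !1.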
673
674// Returns true if the loop has an unroll(full) pragma.
675static bool HasUnrollFullPragma(const Loop *L) {
676  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
677}
678
679// Returns true if the loop has an unroll(enable) pragma. This metadata is used
680// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
681static bool HasUnrollEnablePragma(const Loop *L) {
682  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
683}
684
// Returns true if the loop has a runtime unroll(disable) pragma.
686static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
687  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
688}
689
690// If loop has an unroll_count pragma return the (necessarily
691// positive) value from the pragma.  Otherwise return 0.
692static unsigned UnrollCountPragmaValue(const Loop *L) {
693  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
694  if (MD) {
695    assert(MD->getNumOperands() == 2 &&
696           "Unroll count hint metadata should have two operands.");
697    unsigned Count =
698        mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
699    assert(Count >= 1 && "Unroll count must be positive.");
700    return Count;
701  }
702  return 0;
703}
704
705// Computes the boosting factor for complete unrolling.
706// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
// be beneficial to fully unroll the loop even if UnrolledCost is large. We
708// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
709// the unroll threshold.
710static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
711                                            unsigned MaxPercentThresholdBoost) {
712  if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
713    return 100;
714  else if (Cost.UnrolledCost != 0)
715    // The boosting factor is RolledDynamicCost / UnrolledCost
716    return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
717                    MaxPercentThresholdBoost);
718  else
719    return MaxPercentThresholdBoost;
720}
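
// For example (illustrative numbers): RolledDynamicCost = 800 and
// UnrolledCost = 200 give a raw boost of 100 * 800 / 200 = 400%, which is then
// capped at MaxPercentThresholdBoost.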
721
722// Returns loop size estimation for unrolled loop.
723static uint64_t getUnrolledLoopSize(
724    unsigned LoopSize,
725    TargetTransformInfo::UnrollingPreferences &UP) {
726  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
727  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
728}
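
// For example (illustrative numbers): LoopSize = 12, UP.BEInsns = 2 and
// UP.Count = 4 give (12 - 2) * 4 + 2 = 42 estimated instructions, i.e. the
// backedge bookkeeping is counted once rather than once per unrolled copy.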
729
730// Returns true if unroll count was set explicitly.
731// Calculates unroll count and writes it to UP.Count.
// Will also use metadata and command-line options that are specific to the
// LoopUnroll pass (which, for instance, are irrelevant for the
// LoopUnrollAndJam pass).
735// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
736// many LoopUnroll-specific options. The shared functionality should be
// refactored into its own function.
738bool llvm::computeUnrollCount(
739    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
740    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
741    OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
742    bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
743    TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
744
745  // Check for explicit Count.
746  // 1st priority is unroll count set by "unroll-count" option.
747  bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
748  if (UserUnrollCount) {
749    UP.Count = UnrollCount;
750    UP.AllowExpensiveTripCount = true;
751    UP.Force = true;
752    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
753      return true;
754  }
755
756  // 2nd priority is unroll count set by pragma.
757  unsigned PragmaCount = UnrollCountPragmaValue(L);
758  if (PragmaCount > 0) {
759    UP.Count = PragmaCount;
760    UP.Runtime = true;
761    UP.AllowExpensiveTripCount = true;
762    UP.Force = true;
763    if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
764        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
765      return true;
766  }
767  bool PragmaFullUnroll = HasUnrollFullPragma(L);
768  if (PragmaFullUnroll && TripCount != 0) {
769    UP.Count = TripCount;
770    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
771      return false;
772  }
773
774  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
775  bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
776                        PragmaEnableUnroll || UserUnrollCount;
777
778  if (ExplicitUnroll && TripCount != 0) {
779    // If the loop has an unrolling pragma, we want to be more aggressive with
780    // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
781    // value which is larger than the default limits.
782    UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
783    UP.PartialThreshold =
784        std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
785  }
786
787  // 3rd priority is full unroll count.
788  // Full unroll makes sense only when TripCount or its upper bound could be
789  // statically calculated.
790  // Also we need to check if we exceed FullUnrollMaxCount.
791  // If using the upper bound to unroll, TripMultiple should be set to 1 because
792  // we do not know when loop may exit.
793
794  // We can unroll by the upper bound amount if it's generally allowed or if
795  // we know that the loop is executed either the upper bound or zero times.
796  // (MaxOrZero unrolling keeps only the first loop test, so the number of
797  // loop tests remains the same compared to the non-unrolled version, whereas
798  // the generic upper bound unrolling keeps all but the last loop test so the
799  // number of loop tests goes up which may end up being worse on targets with
800  // constrained branch predictor resources so is controlled by an option.)
801  // In addition we only unroll small upper bounds.
802  unsigned FullUnrollMaxTripCount = MaxTripCount;
803  if (!(UP.UpperBound || MaxOrZero) ||
804      FullUnrollMaxTripCount > UnrollMaxUpperBound)
805    FullUnrollMaxTripCount = 0;
806
807  // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
808  // compute the former when the latter is zero.
809  unsigned ExactTripCount = TripCount;
810  assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
         "ExactTripCount and UnrollByMaxCount cannot both be non-zero.");
812
813  unsigned FullUnrollTripCount =
814      ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
815  UP.Count = FullUnrollTripCount;
816  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
817    // When computing the unrolled size, note that BEInsns are not replicated
818    // like the rest of the loop body.
819    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
820      UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
821      TripCount = FullUnrollTripCount;
822      TripMultiple = UP.UpperBound ? 1 : TripMultiple;
823      return ExplicitUnroll;
824    } else {
825      // The loop isn't that small, but we still can fully unroll it if that
826      // helps to remove a significant number of instructions.
827      // To check that, run additional analysis on the loop.
828      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
829              L, FullUnrollTripCount, DT, SE, EphValues, TTI,
830              UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
831        unsigned Boost =
832            getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
833        if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
834          UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
835          TripCount = FullUnrollTripCount;
836          TripMultiple = UP.UpperBound ? 1 : TripMultiple;
837          return ExplicitUnroll;
838        }
839      }
840    }
841  }
842
843  // 4th priority is loop peeling.
844  computePeelCount(L, LoopSize, UP, TripCount, SE);
845  if (UP.PeelCount) {
846    UP.Runtime = false;
847    UP.Count = 1;
848    return ExplicitUnroll;
849  }
850
851  // 5th priority is partial unrolling.
852  // Try partial unroll only when TripCount could be statically calculated.
853  if (TripCount) {
854    UP.Partial |= ExplicitUnroll;
855    if (!UP.Partial) {
856      LLVM_DEBUG(dbgs() << "  will not try to unroll partially because "
857                        << "-unroll-allow-partial not given\n");
858      UP.Count = 0;
859      return false;
860    }
861    if (UP.Count == 0)
862      UP.Count = TripCount;
863    if (UP.PartialThreshold != NoThreshold) {
      // Reduce the unroll count so that it evenly divides TripCount for
      // partial unrolling.
865      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
866        UP.Count =
867            (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
868            (LoopSize - UP.BEInsns);
869      if (UP.Count > UP.MaxCount)
870        UP.Count = UP.MaxCount;
871      while (UP.Count != 0 && TripCount % UP.Count != 0)
872        UP.Count--;
873      if (UP.AllowRemainder && UP.Count <= 1) {
        // If there is no Count that evenly divides TripCount, set Count to the
        // largest power-of-two factor that satisfies the threshold limit.
        // Since we will create a fixup (remainder) loop, only do this type of
        // unrolling if a remainder loop is allowed.
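        // For example (illustrative numbers): with TripCount = 17 no count
        // greater than 1 divides the trip count, so we restart from
        // DefaultUnrollRuntimeCount (8 by default) and halve it until the
        // size estimate fits under PartialThreshold.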
878        UP.Count = UP.DefaultUnrollRuntimeCount;
879        while (UP.Count != 0 &&
880               getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
881          UP.Count >>= 1;
882      }
883      if (UP.Count < 2) {
884        if (PragmaEnableUnroll)
885          ORE->emit([&]() {
886            return OptimizationRemarkMissed(DEBUG_TYPE,
887                                            "UnrollAsDirectedTooLarge",
888                                            L->getStartLoc(), L->getHeader())
889                   << "Unable to unroll loop as directed by unroll(enable) "
890                      "pragma "
891                      "because unrolled size is too large.";
892          });
893        UP.Count = 0;
894      }
895    } else {
896      UP.Count = TripCount;
897    }
898    if (UP.Count > UP.MaxCount)
899      UP.Count = UP.MaxCount;
900    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
901        UP.Count != TripCount)
902      ORE->emit([&]() {
903        return OptimizationRemarkMissed(DEBUG_TYPE,
904                                        "FullUnrollAsDirectedTooLarge",
905                                        L->getStartLoc(), L->getHeader())
906               << "Unable to fully unroll loop as directed by unroll pragma "
907                  "because "
908                  "unrolled size is too large.";
909      });
910    LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count
911                      << "\n");
912    return ExplicitUnroll;
913  }
914  assert(TripCount == 0 &&
915         "All cases when TripCount is constant should be covered here.");
916  if (PragmaFullUnroll)
917    ORE->emit([&]() {
918      return OptimizationRemarkMissed(
919                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
920                 L->getStartLoc(), L->getHeader())
921             << "Unable to fully unroll loop as directed by unroll(full) "
922                "pragma "
923                "because loop has a runtime trip count.";
924    });
925
926  // 6th priority is runtime unrolling.
927  // Don't unroll a runtime trip count loop when it is disabled.
928  if (HasRuntimeUnrollDisablePragma(L)) {
929    UP.Count = 0;
930    return false;
931  }
932
933  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
934  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
935    UP.Count = 0;
936    return false;
937  }
938
939  // Check if the runtime trip count is too small when profile is available.
940  if (L->getHeader()->getParent()->hasProfileData()) {
941    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
942      if (*ProfileTripCount < FlatLoopTripCountThreshold)
943        return false;
944      else
945        UP.AllowExpensiveTripCount = true;
946    }
947  }
948
949  // Reduce count based on the type of unrolling and the threshold values.
950  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
951  if (!UP.Runtime) {
952    LLVM_DEBUG(
        dbgs() << "  will not try to unroll loop with runtime trip count "
               << "because -unroll-runtime not given\n");
955    UP.Count = 0;
956    return false;
957  }
958  if (UP.Count == 0)
959    UP.Count = UP.DefaultUnrollRuntimeCount;
960
961  // Reduce unroll count to be the largest power-of-two factor of
962  // the original count which satisfies the threshold limit.
963  while (UP.Count != 0 &&
964         getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
965    UP.Count >>= 1;
966
967#ifndef NDEBUG
968  unsigned OrigCount = UP.Count;
969#endif
970
971  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
972    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
973      UP.Count >>= 1;
974    LLVM_DEBUG(
        dbgs() << "Remainder loop is restricted (that could be architecture "
976                  "specific or because the loop contains a convergent "
977                  "instruction), so unroll count must divide the trip "
978                  "multiple, "
979               << TripMultiple << ".  Reducing unroll count from " << OrigCount
980               << " to " << UP.Count << ".\n");
981
982    using namespace ore;
983
984    if (PragmaCount > 0 && !UP.AllowRemainder)
985      ORE->emit([&]() {
986        return OptimizationRemarkMissed(DEBUG_TYPE,
987                                        "DifferentUnrollCountFromDirected",
988                                        L->getStartLoc(), L->getHeader())
989               << "Unable to unroll loop the number of times directed by "
990                  "unroll_count pragma because remainder loop is restricted "
                  "(that could be architecture specific or because the loop "
992                  "contains a convergent instruction) and so must have an "
993                  "unroll "
994                  "count that divides the loop trip multiple of "
995               << NV("TripMultiple", TripMultiple) << ".  Unrolling instead "
996               << NV("UnrollCount", UP.Count) << " time(s).";
997      });
998  }
999
1000  if (UP.Count > UP.MaxCount)
1001    UP.Count = UP.MaxCount;
1002
1003  if (MaxTripCount && UP.Count > MaxTripCount)
1004    UP.Count = MaxTripCount;
1005
1006  LLVM_DEBUG(dbgs() << "  runtime unrolling with count: " << UP.Count
1007                    << "\n");
1008  if (UP.Count < 2)
1009    UP.Count = 0;
1010  return ExplicitUnroll;
1011}
1012
1013static LoopUnrollResult tryToUnrollLoop(
1014    Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
1015    const TargetTransformInfo &TTI, AssumptionCache &AC,
1016    OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
1017    ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1018    bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
1019    Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
1020    Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
1021    Optional<bool> ProvidedAllowPeeling,
1022    Optional<bool> ProvidedAllowProfileBasedPeeling,
1023    Optional<unsigned> ProvidedFullUnrollMaxCount) {
1024  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1025                    << L->getHeader()->getParent()->getName() << "] Loop %"
1026                    << L->getHeader()->getName() << "\n");
1027  TransformationMode TM = hasUnrollTransformation(L);
1028  if (TM & TM_Disable)
1029    return LoopUnrollResult::Unmodified;
1030  if (!L->isLoopSimplifyForm()) {
1031    LLVM_DEBUG(
1032        dbgs() << "  Not unrolling loop which is not in loop-simplify form.\n");
1033    return LoopUnrollResult::Unmodified;
1034  }
1035
  // When automatic unrolling is disabled, do not unroll unless overridden for
1037  // this loop.
1038  if (OnlyWhenForced && !(TM & TM_Enable))
1039    return LoopUnrollResult::Unmodified;
1040
1041  bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1042  unsigned NumInlineCandidates;
1043  bool NotDuplicatable;
1044  bool Convergent;
1045  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
1046      L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
1047      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1048      ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
1049      ProvidedFullUnrollMaxCount);
1050
1051  // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1052  // as threshold later on.
1053  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1054      !OptForSize)
1055    return LoopUnrollResult::Unmodified;
1056
1057  SmallPtrSet<const Value *, 32> EphValues;
1058  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1059
1060  unsigned LoopSize =
1061      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
1062                          TTI, EphValues, UP.BEInsns);
1063  LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
1064  if (NotDuplicatable) {
1065    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
1066                      << " instructions.\n");
1067    return LoopUnrollResult::Unmodified;
1068  }
1069
  // When optimizing for size, use LoopSize + 1 as the threshold (the later
  // comparison uses < Threshold), so loops are (fully) unrolled only if doing
  // so does not increase code size.
1072  if (OptForSize)
1073    UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1074
1075  if (NumInlineCandidates != 0) {
1076    LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
1077    return LoopUnrollResult::Unmodified;
1078  }
1079
1080  // Find trip count and trip multiple if count is not available
1081  unsigned TripCount = 0;
1082  unsigned TripMultiple = 1;
1083  // If there are multiple exiting blocks but one of them is the latch, use the
1084  // latch for the trip count estimation. Otherwise insist on a single exiting
1085  // block for the trip count estimation.
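  // For example (an illustrative case): if SCEV can only prove that the trip
  // count is a multiple of 4, getSmallConstantTripCount below may return 0
  // while getSmallConstantTripMultiple returns 4, which later permits
  // unrolling by 4 without a remainder loop.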
1086  BasicBlock *ExitingBlock = L->getLoopLatch();
1087  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1088    ExitingBlock = L->getExitingBlock();
1089  if (ExitingBlock) {
1090    TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
1091    TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1092  }
1093
  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few iterations before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation.  Therefore restrict the remainder loop (try unrolling without
  // one).
1098  //
1099  // TODO: This is quite conservative.  In practice, convergent_op()
1100  // is likely to be called unconditionally in the loop.  In this
1101  // case, the program would be ill-formed (on most architectures)
1102  // unless n were the same on all threads in a thread group.
1103  // Assuming n is the same on all threads, any kind of unrolling is
1104  // safe.  But currently llvm's notion of convergence isn't powerful
1105  // enough to express this.
1106  if (Convergent)
1107    UP.AllowRemainder = false;
1108
1109  // Try to find the trip count upper bound if we cannot find the exact trip
1110  // count.
1111  unsigned MaxTripCount = 0;
1112  bool MaxOrZero = false;
1113  if (!TripCount) {
1114    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1115    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1116  }
1117
1118  // computeUnrollCount() decides whether it is beneficial to use upper bound to
1119  // fully unroll the loop.
1120  bool UseUpperBound = false;
1121  bool IsCountSetExplicitly = computeUnrollCount(
1122      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
1123      TripMultiple, LoopSize, UP, UseUpperBound);
1124  if (!UP.Count)
1125    return LoopUnrollResult::Unmodified;
  // Unroll factor (Count) must be less than or equal to TripCount.
1127  if (TripCount && UP.Count > TripCount)
1128    UP.Count = TripCount;
1129
1130  // Save loop properties before it is transformed.
1131  MDNode *OrigLoopID = L->getLoopID();
1132
1133  // Unroll the loop.
1134  Loop *RemainderLoop = nullptr;
1135  LoopUnrollResult UnrollResult = UnrollLoop(
1136      L,
1137      {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
1138       UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
1139       ForgetAllSCEV},
1140      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
1141  if (UnrollResult == LoopUnrollResult::Unmodified)
1142    return LoopUnrollResult::Unmodified;
1143
1144  if (RemainderLoop) {
1145    Optional<MDNode *> RemainderLoopID =
1146        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
1147                                        LLVMLoopUnrollFollowupRemainder});
1148    if (RemainderLoopID.hasValue())
1149      RemainderLoop->setLoopID(RemainderLoopID.getValue());
1150  }
1151
1152  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1153    Optional<MDNode *> NewLoopID =
1154        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
1155                                        LLVMLoopUnrollFollowupUnrolled});
1156    if (NewLoopID.hasValue()) {
1157      L->setLoopID(NewLoopID.getValue());
1158
1159      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1160      // explicitly.
1161      return UnrollResult;
1162    }
1163  }
1164
  // If the loop has an unroll count pragma or was unrolled by an explicitly
  // set count, mark it as unrolled to prevent unrolling beyond the requested
  // count.
1167  // If the loop was peeled, we already "used up" the profile information
1168  // we had, so we don't want to unroll or peel again.
1169  if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
1170      (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount)))
1171    L->setLoopAlreadyUnrolled();
1172
1173  return UnrollResult;
1174}
1175
1176namespace {
1177
1178class LoopUnroll : public LoopPass {
1179public:
1180  static char ID; // Pass ID, replacement for typeid
1181
1182  int OptLevel;
1183
1184  /// If false, use a cost model to determine whether unrolling of a loop is
1185  /// profitable. If true, only loops that explicitly request unrolling via
1186  /// metadata are considered. All other loops are skipped.
1187  bool OnlyWhenForced;
1188
  /// If false, when SCEV is invalidated, only forget everything in the
  /// top-most loop of the loop being processed (via forgetTopMostLoop).
  /// Otherwise, call forgetAllLoops and rebuild when needed next.
1192  bool ForgetAllSCEV;
1193
1194  Optional<unsigned> ProvidedCount;
1195  Optional<unsigned> ProvidedThreshold;
1196  Optional<bool> ProvidedAllowPartial;
1197  Optional<bool> ProvidedRuntime;
1198  Optional<bool> ProvidedUpperBound;
1199  Optional<bool> ProvidedAllowPeeling;
1200  Optional<bool> ProvidedAllowProfileBasedPeeling;
1201  Optional<unsigned> ProvidedFullUnrollMaxCount;
1202
1203  LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1204             bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
1205             Optional<unsigned> Count = None,
1206             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
1207             Optional<bool> UpperBound = None,
1208             Optional<bool> AllowPeeling = None,
1209             Optional<bool> AllowProfileBasedPeeling = None,
1210             Optional<unsigned> ProvidedFullUnrollMaxCount = None)
1211      : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1212        ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1213        ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1214        ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1215        ProvidedAllowPeeling(AllowPeeling),
1216        ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1217        ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1218    initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
1219  }
1220
1221  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1222    if (skipLoop(L))
1223      return false;
1224
1225    Function &F = *L->getHeader()->getParent();
1226
1227    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1228    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1229    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1230    const TargetTransformInfo &TTI =
1231        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1232    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1233    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1234    // pass.  Function analyses need to be preserved across loop transformations
1235    // but ORE cannot be preserved (see comment before the pass definition).
1236    OptimizationRemarkEmitter ORE(&F);
1237    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1238
1239    LoopUnrollResult Result = tryToUnrollLoop(
1240        L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1241        OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
1242        ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1243        ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
1244        ProvidedFullUnrollMaxCount);
1245
1246    if (Result == LoopUnrollResult::FullyUnrolled)
1247      LPM.markLoopAsDeleted(*L);
1248
1249    return Result != LoopUnrollResult::Unmodified;
1250  }
1251
1252  /// This transformation requires natural loop information & requires that
1253  /// loop preheaders be inserted into the CFG...
1254  void getAnalysisUsage(AnalysisUsage &AU) const override {
1255    AU.addRequired<AssumptionCacheTracker>();
1256    AU.addRequired<TargetTransformInfoWrapperPass>();
1257    // FIXME: Loop passes are required to preserve domtree, and for now we just
1258    // recreate dom info if anything gets unrolled.
1259    getLoopAnalysisUsage(AU);
1260  }
1261};
1262
1263} // end anonymous namespace
1264
1265char LoopUnroll::ID = 0;
1266
1267INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1268INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1269INITIALIZE_PASS_DEPENDENCY(LoopPass)
1270INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1271INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1272
1273Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1274                                 bool ForgetAllSCEV, int Threshold, int Count,
1275                                 int AllowPartial, int Runtime, int UpperBound,
1276                                 int AllowPeeling) {
1277  // TODO: It would make more sense for this function to take the optionals
1278  // directly, but that's dangerous since it would silently break out of tree
1279  // callers.
  return new LoopUnroll(
      OptLevel, OnlyWhenForced, ForgetAllSCEV,
      Threshold == -1 ? None : Optional<unsigned>(Threshold),
      Count == -1 ? None : Optional<unsigned>(Count),
      AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
      Runtime == -1 ? None : Optional<bool>(Runtime),
      UpperBound == -1 ? None : Optional<bool>(UpperBound),
      AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
}

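// The "simple" unroller is intended to only perform full unrolling: partial,
// runtime, upper-bound, and peeling-based unrolling are explicitly disabled,
// while the threshold and count are left at their defaults.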
Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
                                       bool ForgetAllSCEV) {
  return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
                              0, 0, 0, 0);
}

PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &Updater) {
  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  Function *F = L.getHeader()->getParent();

  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error(
        "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
        "cached at a higher level");

  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());

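  // Save the loop name up front: if the loop is fully unrolled and deleted,
  // the name is still needed to report the deletion to the updater.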
  std::string LoopName = L.getName();

  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
                                 /*BFI*/ nullptr, /*PSI*/ nullptr,
                                 /*PreserveLCSSA*/ true, OptLevel,
                                 OnlyWhenForced, ForgetSCEV, /*Count*/ None,
                                 /*Threshold*/ None, /*AllowPartial*/ false,
                                 /*Runtime*/ false, /*UpperBound*/ false,
                                 /*AllowPeeling*/ false,
                                 /*AllowProfileBasedPeeling*/ false,
                                 /*FullUnrollMaxCount*/ None) !=
                 LoopUnrollResult::Unmodified;
  if (!Changed)
    return PreservedAnalyses::all();

  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif

  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop, making all of the children appear to be new
  //   sibling loops.
  //
  // When a new loop appears as a sibling loop after fully unrolling,
  // its nesting structure has fundamentally changed and we want to revisit
  // it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  erase_if(SibLoops, [&](Loop *SibLoop) {
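    // If the current loop still appears in the sibling list, it survived
    // unrolling; remember that and drop it from the new-sibling set.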
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }

    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.count(SibLoop) != 0;
  });
  Updater.addSiblingLoops(SibLoops);

  if (!IsCurrentLoopValid) {
    Updater.markLoopAsDeleted(L, LoopName);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }

  return getLoopPassPreservedAnalyses();
}

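/// Flatten \p Loops and all of their descendants into a single worklist,
/// visiting each parent loop before its children.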
template <typename RangeT>
static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) {
  SmallVector<Loop *, 8> Worklist;
  // We use an internal worklist to build up the preorder traversal without
  // recursion.
  SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;

  for (Loop *RootL : Loops) {
    assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
    assert(PreOrderWorklist.empty() &&
           "Must start with an empty preorder walk worklist.");
    PreOrderWorklist.push_back(RootL);
    do {
      Loop *L = PreOrderWorklist.pop_back_val();
      PreOrderWorklist.append(L->begin(), L->end());
      PreOrderLoops.push_back(L);
    } while (!PreOrderWorklist.empty());

    Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
    PreOrderLoops.clear();
  }
  return Worklist;
}

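// Usage sketch (illustrative; assumes the defaulted LoopUnrollOptions
// constructor declared in LoopUnrollPass.h): the function-level unroller can
// be dropped straight into a FunctionPassManager:
//
//   FunctionPassManager FPM;
//   FPM.addPass(LoopUnrollPass(LoopUnrollOptions(/*OptLevel*/ 2)));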
PreservedAnalyses LoopUnrollPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

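  // If a loop analysis manager proxy is already cached, grab the inner
  // manager so cached loop analyses can be cleared for any loop that is
  // deleted by full unrolling below.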
  LoopAnalysisManager *LAM = nullptr;
  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
    LAM = &LAMProxy->getManager();

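  // Profile information, when available, is handed to the unroller for
  // profile-guided decisions; BFI is only computed if a profile summary is
  // actually present.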
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
      &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;

  bool Changed = false;

  // The unroller requires loops to be in simplified form, and also needs LCSSA.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop unroller
  // will simplify all loops, regardless of whether anything ends up being
  // unrolled.
  for (auto &L : LI) {
    Changed |=
        simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }

  SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI);

  while (!Worklist.empty()) {
    // Because the LoopInfo stores the loops in RPO, we walk the worklist
    // from back to front so that we work forward across the CFG, which
    // for unrolling is only needed to get optimization remarks emitted in
    // a forward order.
    Loop &L = *Worklist.pop_back_val();
#ifndef NDEBUG
    Loop *ParentL = L.getParentLoop();
#endif

    // Check if the profile summary indicates that the profiled application
    // has a huge working set size, in which case we disable peeling to avoid
    // bloating it further.
    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
    if (PSI && PSI->hasHugeWorkingSetSize())
      LocalAllowPeeling = false;
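    // Save the name now; if the loop is fully unrolled it will be deleted,
    // but the name is still needed to clear its cached analyses below.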
    std::string LoopName = L.getName();
    // The API here is quite complex to call, and we allow selecting some
    // flavors of unrolling at construction time (by setting UnrollOpts).
    LoopUnrollResult Result = tryToUnrollLoop(
        &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
        UnrollOpts.ForgetSCEV, /*Count*/ None,
        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
        UnrollOpts.AllowUpperBound, LocalAllowPeeling,
        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
    Changed |= Result != LoopUnrollResult::Unmodified;

    // The parent must not be damaged by unrolling!
#ifndef NDEBUG
    if (Result != LoopUnrollResult::Unmodified && ParentL)
      ParentL->verifyLoop();
#endif

    // Clear any cached analysis results for L if we removed it completely.
    if (LAM && Result == LoopUnrollResult::FullyUnrolled)
      LAM->clear(L, LoopName);
  }

  if (!Changed)
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}