1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller.  It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
11// counts of loops easily.
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/DenseMapInfo.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/None.h"
19#include "llvm/ADT/Optional.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include "llvm/Analysis/AssumptionCache.h"
26#include "llvm/Analysis/BlockFrequencyInfo.h"
27#include "llvm/Analysis/CodeMetrics.h"
28#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
29#include "llvm/Analysis/LoopAnalysisManager.h"
30#include "llvm/Analysis/LoopInfo.h"
31#include "llvm/Analysis/LoopPass.h"
32#include "llvm/Analysis/LoopUnrollAnalyzer.h"
33#include "llvm/Analysis/OptimizationRemarkEmitter.h"
34#include "llvm/Analysis/ProfileSummaryInfo.h"
35#include "llvm/Analysis/ScalarEvolution.h"
36#include "llvm/Analysis/TargetTransformInfo.h"
37#include "llvm/IR/BasicBlock.h"
38#include "llvm/IR/CFG.h"
39#include "llvm/IR/Constant.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DiagnosticInfo.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Instructions.h"
46#include "llvm/IR/IntrinsicInst.h"
47#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
49#include "llvm/InitializePasses.h"
50#include "llvm/Pass.h"
51#include "llvm/Support/Casting.h"
52#include "llvm/Support/CommandLine.h"
53#include "llvm/Support/Debug.h"
54#include "llvm/Support/ErrorHandling.h"
55#include "llvm/Support/raw_ostream.h"
56#include "llvm/Transforms/Scalar.h"
57#include "llvm/Transforms/Scalar/LoopPassManager.h"
58#include "llvm/Transforms/Utils.h"
59#include "llvm/Transforms/Utils/LoopSimplify.h"
60#include "llvm/Transforms/Utils/LoopUtils.h"
61#include "llvm/Transforms/Utils/SizeOpts.h"
62#include "llvm/Transforms/Utils/UnrollLoop.h"
63#include <algorithm>
64#include <cassert>
65#include <cstdint>
66#include <limits>
67#include <string>
68#include <tuple>
69#include <utility>
70
71using namespace llvm;
72
73#define DEBUG_TYPE "loop-unroll"
74
75cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
76    "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
77    cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
78             " the current top-most loop. This is somtimes preferred to reduce"
79             " compile time."));
80
81static cl::opt<unsigned>
82    UnrollThreshold("unroll-threshold", cl::Hidden,
83                    cl::desc("The cost threshold for loop unrolling"));
84
85static cl::opt<unsigned> UnrollPartialThreshold(
86    "unroll-partial-threshold", cl::Hidden,
87    cl::desc("The cost threshold for partial loop unrolling"));
88
89static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
90    "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
91    cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
92             "to the threshold when aggressively unrolling a loop due to the "
93             "dynamic cost savings. If completely unrolling a loop will reduce "
94             "the total runtime from X to Y, we boost the loop unroll "
95             "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
96             "X/Y). This limit avoids excessive code bloat."));
97
98static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
99    "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
100    cl::desc("Don't allow loop unrolling to simulate more than this number of"
101             "iterations when checking full unroll profitability"));
102
103static cl::opt<unsigned> UnrollCount(
104    "unroll-count", cl::Hidden,
105    cl::desc("Use this unroll count for all loops including those with "
106             "unroll_count pragma values, for testing purposes"));
107
108static cl::opt<unsigned> UnrollMaxCount(
109    "unroll-max-count", cl::Hidden,
110    cl::desc("Set the max unroll count for partial and runtime unrolling, for"
111             "testing purposes"));
112
113static cl::opt<unsigned> UnrollFullMaxCount(
114    "unroll-full-max-count", cl::Hidden,
115    cl::desc(
116        "Set the max unroll count for full unrolling, for testing purposes"));
117
118static cl::opt<unsigned> UnrollPeelCount(
119    "unroll-peel-count", cl::Hidden,
120    cl::desc("Set the unroll peeling count, for testing purposes"));
121
122static cl::opt<bool>
123    UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
124                       cl::desc("Allows loops to be partially unrolled until "
125                                "-unroll-threshold loop size is reached."));
126
127static cl::opt<bool> UnrollAllowRemainder(
128    "unroll-allow-remainder", cl::Hidden,
129    cl::desc("Allow generation of a loop remainder (extra iterations) "
130             "when unrolling a loop."));
131
132static cl::opt<bool>
133    UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
134                  cl::desc("Unroll loops with run-time trip counts"));
135
136static cl::opt<unsigned> UnrollMaxUpperBound(
137    "unroll-max-upperbound", cl::init(8), cl::Hidden,
138    cl::desc(
139        "The max of trip count upper bound that is considered in unrolling"));
140
141static cl::opt<unsigned> PragmaUnrollThreshold(
142    "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
143    cl::desc("Unrolled size limit for loops with an unroll(full) or "
144             "unroll_count pragma."));
145
146static cl::opt<unsigned> FlatLoopTripCountThreshold(
147    "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
148    cl::desc("If the runtime tripcount for the loop is lower than the "
149             "threshold, the loop is considered as flat and will be less "
150             "aggressively unrolled."));
151
152static cl::opt<bool>
153    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
154                       cl::desc("Allows loops to be peeled when the dynamic "
155                                "trip count is known to be low."));
156
157static cl::opt<bool> UnrollAllowLoopNestsPeeling(
158    "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden,
159    cl::desc("Allows loop nests to be peeled."));
160
161static cl::opt<bool> UnrollUnrollRemainder(
162  "unroll-remainder", cl::Hidden,
163  cl::desc("Allow the loop remainder to be unrolled."));
164
165// This option isn't ever intended to be enabled, it serves to allow
166// experiments to check the assumptions about when this kind of revisit is
167// necessary.
168static cl::opt<bool> UnrollRevisitChildLoops(
169    "unroll-revisit-child-loops", cl::Hidden,
170    cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
171             "This shouldn't typically be needed as child loops (or their "
172             "clones) were already visited."));
173
174static cl::opt<unsigned> UnrollThresholdAggressive(
175    "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
176    cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
177             "optimizations"));
178static cl::opt<unsigned>
179    UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
180                           cl::Hidden,
181                           cl::desc("Default threshold (max size of unrolled "
182                                    "loop), used in all but O3 optimizations"));
183
184/// A magic value for use with the Threshold parameter to indicate
185/// that the loop unroll should be performed regardless of how much
186/// code expansion would result.
187static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
188
189/// Gather the various unrolling parameters based on the defaults, compiler
190/// flags, TTI overrides and user specified parameters.
191TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
192    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
193    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
194    Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
195    Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
196    Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
197  TargetTransformInfo::UnrollingPreferences UP;
198
199  // Set up the defaults
200  UP.Threshold =
201      OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
202  UP.MaxPercentThresholdBoost = 400;
203  UP.OptSizeThreshold = 0;
204  UP.PartialThreshold = 150;
205  UP.PartialOptSizeThreshold = 0;
206  UP.Count = 0;
207  UP.DefaultUnrollRuntimeCount = 8;
208  UP.MaxCount = std::numeric_limits<unsigned>::max();
209  UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
210  UP.BEInsns = 2;
211  UP.Partial = false;
212  UP.Runtime = false;
213  UP.AllowRemainder = true;
214  UP.UnrollRemainder = false;
215  UP.AllowExpensiveTripCount = false;
216  UP.Force = false;
217  UP.UpperBound = false;
218  UP.UnrollAndJam = false;
219  UP.UnrollAndJamInnerLoopThreshold = 60;
220  UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
221
222  // Override with any target specific settings
223  TTI.getUnrollingPreferences(L, SE, UP);
224
225  // Apply size attributes
226  bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
227                    llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
228                                                PGSOQueryType::IRPass);
229  if (OptForSize) {
230    UP.Threshold = UP.OptSizeThreshold;
231    UP.PartialThreshold = UP.PartialOptSizeThreshold;
232    UP.MaxPercentThresholdBoost = 100;
233  }
234
235  // Apply any user values specified by cl::opt
236  if (UnrollThreshold.getNumOccurrences() > 0)
237    UP.Threshold = UnrollThreshold;
238  if (UnrollPartialThreshold.getNumOccurrences() > 0)
239    UP.PartialThreshold = UnrollPartialThreshold;
240  if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
241    UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
242  if (UnrollMaxCount.getNumOccurrences() > 0)
243    UP.MaxCount = UnrollMaxCount;
244  if (UnrollFullMaxCount.getNumOccurrences() > 0)
245    UP.FullUnrollMaxCount = UnrollFullMaxCount;
246  if (UnrollAllowPartial.getNumOccurrences() > 0)
247    UP.Partial = UnrollAllowPartial;
248  if (UnrollAllowRemainder.getNumOccurrences() > 0)
249    UP.AllowRemainder = UnrollAllowRemainder;
250  if (UnrollRuntime.getNumOccurrences() > 0)
251    UP.Runtime = UnrollRuntime;
252  if (UnrollMaxUpperBound == 0)
253    UP.UpperBound = false;
254  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
255    UP.UnrollRemainder = UnrollUnrollRemainder;
256  if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
257    UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
258
259  // Apply user values provided by argument
260  if (UserThreshold.hasValue()) {
261    UP.Threshold = *UserThreshold;
262    UP.PartialThreshold = *UserThreshold;
263  }
264  if (UserCount.hasValue())
265    UP.Count = *UserCount;
266  if (UserAllowPartial.hasValue())
267    UP.Partial = *UserAllowPartial;
268  if (UserRuntime.hasValue())
269    UP.Runtime = *UserRuntime;
270  if (UserUpperBound.hasValue())
271    UP.UpperBound = *UserUpperBound;
272  if (UserFullUnrollMaxCount.hasValue())
273    UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
274
275  return UP;
276}
277
278TargetTransformInfo::PeelingPreferences
279llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
280                               const TargetTransformInfo &TTI,
281                               Optional<bool> UserAllowPeeling,
282                               Optional<bool> UserAllowProfileBasedPeeling) {
283  TargetTransformInfo::PeelingPreferences PP;
284
285  // Default values
286  PP.PeelCount = 0;
287  PP.AllowPeeling = true;
288  PP.AllowLoopNestsPeeling = false;
289  PP.PeelProfiledIterations = true;
290
291  // Get Target Specifc Values
292  TTI.getPeelingPreferences(L, SE, PP);
293
294  // User Specified Values using cl::opt
295  if (UnrollPeelCount.getNumOccurrences() > 0)
296    PP.PeelCount = UnrollPeelCount;
297  if (UnrollAllowPeeling.getNumOccurrences() > 0)
298    PP.AllowPeeling = UnrollAllowPeeling;
299  if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
300    PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
301
302  // User Specifed values provided by argument
303  if (UserAllowPeeling.hasValue())
304    PP.AllowPeeling = *UserAllowPeeling;
305  if (UserAllowProfileBasedPeeling.hasValue())
306    PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
307
308  return PP;
309}
310
311namespace {
312
313/// A struct to densely store the state of an instruction after unrolling at
314/// each iteration.
315///
316/// This is designed to work like a tuple of <Instruction *, int> for the
317/// purposes of hashing and lookup, but to be able to associate two boolean
318/// states with each key.
319struct UnrolledInstState {
320  Instruction *I;
321  int Iteration : 30;
322  unsigned IsFree : 1;
323  unsigned IsCounted : 1;
324};
325
326/// Hashing and equality testing for a set of the instruction states.
327struct UnrolledInstStateKeyInfo {
328  using PtrInfo = DenseMapInfo<Instruction *>;
329  using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
330
331  static inline UnrolledInstState getEmptyKey() {
332    return {PtrInfo::getEmptyKey(), 0, 0, 0};
333  }
334
335  static inline UnrolledInstState getTombstoneKey() {
336    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
337  }
338
339  static inline unsigned getHashValue(const UnrolledInstState &S) {
340    return PairInfo::getHashValue({S.I, S.Iteration});
341  }
342
343  static inline bool isEqual(const UnrolledInstState &LHS,
344                             const UnrolledInstState &RHS) {
345    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
346  }
347};
348
349struct EstimatedUnrollCost {
350  /// The estimated cost after unrolling.
351  unsigned UnrolledCost;
352
353  /// The estimated dynamic cost of executing the instructions in the
354  /// rolled form.
355  unsigned RolledDynamicCost;
356};
357
358} // end anonymous namespace
359
360/// Figure out if the loop is worth full unrolling.
361///
362/// Complete loop unrolling can make some loads constant, and we need to know
363/// if that would expose any further optimization opportunities.  This routine
364/// estimates this optimization.  It computes cost of unrolled loop
365/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
366/// dynamic cost we mean that we won't count costs of blocks that are known not
367/// to be executed (i.e. if we have a branch in the loop and we know that at the
368/// given iteration its condition would be resolved to true, we won't add up the
369/// cost of the 'false'-block).
370/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
371/// the analysis failed (no benefits expected from the unrolling, or the loop is
372/// too big to analyze), the returned value is None.
373static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
374    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
375    const SmallPtrSetImpl<const Value *> &EphValues,
376    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
377    unsigned MaxIterationsCountToAnalyze) {
378  // We want to be able to scale offsets by the trip count and add more offsets
379  // to them without checking for overflows, and we already don't want to
380  // analyze *massive* trip counts, so we force the max to be reasonably small.
381  assert(MaxIterationsCountToAnalyze <
382             (unsigned)(std::numeric_limits<int>::max() / 2) &&
383         "The unroll iterations max is too large!");
384
385  // Only analyze inner loops. We can't properly estimate cost of nested loops
386  // and we won't visit inner loops again anyway.
387  if (!L->empty())
388    return None;
389
390  // Don't simulate loops with a big or unknown tripcount
391  if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
392    return None;
393
394  SmallSetVector<BasicBlock *, 16> BBWorklist;
395  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
396  DenseMap<Value *, Constant *> SimplifiedValues;
397  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
398
399  // The estimated cost of the unrolled form of the loop. We try to estimate
400  // this by simplifying as much as we can while computing the estimate.
401  unsigned UnrolledCost = 0;
402
403  // We also track the estimated dynamic (that is, actually executed) cost in
404  // the rolled form. This helps identify cases when the savings from unrolling
405  // aren't just exposing dead control flows, but actual reduced dynamic
406  // instructions due to the simplifications which we expect to occur after
407  // unrolling.
408  unsigned RolledDynamicCost = 0;
409
410  // We track the simplification of each instruction in each iteration. We use
411  // this to recursively merge costs into the unrolled cost on-demand so that
412  // we don't count the cost of any dead code. This is essentially a map from
413  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
414  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
415
416  // A small worklist used to accumulate cost of instructions from each
417  // observable and reached root in the loop.
418  SmallVector<Instruction *, 16> CostWorklist;
419
420  // PHI-used worklist used between iterations while accumulating cost.
421  SmallVector<Instruction *, 4> PHIUsedList;
422
423  // Helper function to accumulate cost for instructions in the loop.
424  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
425    assert(Iteration >= 0 && "Cannot have a negative iteration!");
426    assert(CostWorklist.empty() && "Must start with an empty cost list");
427    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
428    CostWorklist.push_back(&RootI);
429    for (;; --Iteration) {
430      do {
431        Instruction *I = CostWorklist.pop_back_val();
432
433        // InstCostMap only uses I and Iteration as a key, the other two values
434        // don't matter here.
435        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
436        if (CostIter == InstCostMap.end())
437          // If an input to a PHI node comes from a dead path through the loop
438          // we may have no cost data for it here. What that actually means is
439          // that it is free.
440          continue;
441        auto &Cost = *CostIter;
442        if (Cost.IsCounted)
443          // Already counted this instruction.
444          continue;
445
446        // Mark that we are counting the cost of this instruction now.
447        Cost.IsCounted = true;
448
449        // If this is a PHI node in the loop header, just add it to the PHI set.
450        if (auto *PhiI = dyn_cast<PHINode>(I))
451          if (PhiI->getParent() == L->getHeader()) {
452            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
453                                  "inherently simplify during unrolling.");
454            if (Iteration == 0)
455              continue;
456
457            // Push the incoming value from the backedge into the PHI used list
458            // if it is an in-loop instruction. We'll use this to populate the
459            // cost worklist for the next iteration (as we count backwards).
460            if (auto *OpI = dyn_cast<Instruction>(
461                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
462              if (L->contains(OpI))
463                PHIUsedList.push_back(OpI);
464            continue;
465          }
466
467        // First accumulate the cost of this instruction.
468        if (!Cost.IsFree) {
469          UnrolledCost += TTI.getUserCost(I, TargetTransformInfo::TCK_CodeSize);
470          LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
471                            << Iteration << "): ");
472          LLVM_DEBUG(I->dump());
473        }
474
475        // We must count the cost of every operand which is not free,
476        // recursively. If we reach a loop PHI node, simply add it to the set
477        // to be considered on the next iteration (backwards!).
478        for (Value *Op : I->operands()) {
479          // Check whether this operand is free due to being a constant or
480          // outside the loop.
481          auto *OpI = dyn_cast<Instruction>(Op);
482          if (!OpI || !L->contains(OpI))
483            continue;
484
485          // Otherwise accumulate its cost.
486          CostWorklist.push_back(OpI);
487        }
488      } while (!CostWorklist.empty());
489
490      if (PHIUsedList.empty())
491        // We've exhausted the search.
492        break;
493
494      assert(Iteration > 0 &&
495             "Cannot track PHI-used values past the first iteration!");
496      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
497      PHIUsedList.clear();
498    }
499  };
500
501  // Ensure that we don't violate the loop structure invariants relied on by
502  // this analysis.
503  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
504  assert(L->isLCSSAForm(DT) &&
505         "Must have loops in LCSSA form to track live-out values.");
506
507  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
508
509  // Simulate execution of each iteration of the loop counting instructions,
510  // which would be simplified.
511  // Since the same load will take different values on different iterations,
512  // we literally have to go through all loop's iterations.
513  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
514    LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
515
516    // Prepare for the iteration by collecting any simplified entry or backedge
517    // inputs.
518    for (Instruction &I : *L->getHeader()) {
519      auto *PHI = dyn_cast<PHINode>(&I);
520      if (!PHI)
521        break;
522
523      // The loop header PHI nodes must have exactly two input: one from the
524      // loop preheader and one from the loop latch.
525      assert(
526          PHI->getNumIncomingValues() == 2 &&
527          "Must have an incoming value only for the preheader and the latch.");
528
529      Value *V = PHI->getIncomingValueForBlock(
530          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
531      Constant *C = dyn_cast<Constant>(V);
532      if (Iteration != 0 && !C)
533        C = SimplifiedValues.lookup(V);
534      if (C)
535        SimplifiedInputValues.push_back({PHI, C});
536    }
537
538    // Now clear and re-populate the map for the next iteration.
539    SimplifiedValues.clear();
540    while (!SimplifiedInputValues.empty())
541      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
542
543    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
544
545    BBWorklist.clear();
546    BBWorklist.insert(L->getHeader());
547    // Note that we *must not* cache the size, this loop grows the worklist.
548    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
549      BasicBlock *BB = BBWorklist[Idx];
550
551      // Visit all instructions in the given basic block and try to simplify
552      // it.  We don't change the actual IR, just count optimization
553      // opportunities.
554      for (Instruction &I : *BB) {
555        // These won't get into the final code - don't even try calculating the
556        // cost for them.
557        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
558          continue;
559
560        // Track this instruction's expected baseline cost when executing the
561        // rolled loop form.
562        RolledDynamicCost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
563
564        // Visit the instruction to analyze its loop cost after unrolling,
565        // and if the visitor returns true, mark the instruction as free after
566        // unrolling and continue.
567        bool IsFree = Analyzer.visit(I);
568        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
569                                           (unsigned)IsFree,
570                                           /*IsCounted*/ false}).second;
571        (void)Inserted;
572        assert(Inserted && "Cannot have a state for an unvisited instruction!");
573
574        if (IsFree)
575          continue;
576
577        // Can't properly model a cost of a call.
578        // FIXME: With a proper cost model we should be able to do it.
579        if (auto *CI = dyn_cast<CallInst>(&I)) {
580          const Function *Callee = CI->getCalledFunction();
581          if (!Callee || TTI.isLoweredToCall(Callee)) {
582            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
583            return None;
584          }
585        }
586
587        // If the instruction might have a side-effect recursively account for
588        // the cost of it and all the instructions leading up to it.
589        if (I.mayHaveSideEffects())
590          AddCostRecursively(I, Iteration);
591
592        // If unrolled body turns out to be too big, bail out.
593        if (UnrolledCost > MaxUnrolledLoopSize) {
594          LLVM_DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
595                            << "  UnrolledCost: " << UnrolledCost
596                            << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
597                            << "\n");
598          return None;
599        }
600      }
601
602      Instruction *TI = BB->getTerminator();
603
604      // Add in the live successors by first checking whether we have terminator
605      // that may be simplified based on the values simplified by this call.
606      BasicBlock *KnownSucc = nullptr;
607      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
608        if (BI->isConditional()) {
609          if (Constant *SimpleCond =
610                  SimplifiedValues.lookup(BI->getCondition())) {
611            // Just take the first successor if condition is undef
612            if (isa<UndefValue>(SimpleCond))
613              KnownSucc = BI->getSuccessor(0);
614            else if (ConstantInt *SimpleCondVal =
615                         dyn_cast<ConstantInt>(SimpleCond))
616              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
617          }
618        }
619      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
620        if (Constant *SimpleCond =
621                SimplifiedValues.lookup(SI->getCondition())) {
622          // Just take the first successor if condition is undef
623          if (isa<UndefValue>(SimpleCond))
624            KnownSucc = SI->getSuccessor(0);
625          else if (ConstantInt *SimpleCondVal =
626                       dyn_cast<ConstantInt>(SimpleCond))
627            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
628        }
629      }
630      if (KnownSucc) {
631        if (L->contains(KnownSucc))
632          BBWorklist.insert(KnownSucc);
633        else
634          ExitWorklist.insert({BB, KnownSucc});
635        continue;
636      }
637
638      // Add BB's successors to the worklist.
639      for (BasicBlock *Succ : successors(BB))
640        if (L->contains(Succ))
641          BBWorklist.insert(Succ);
642        else
643          ExitWorklist.insert({BB, Succ});
644      AddCostRecursively(*TI, Iteration);
645    }
646
647    // If we found no optimization opportunities on the first iteration, we
648    // won't find them on later ones too.
649    if (UnrolledCost == RolledDynamicCost) {
650      LLVM_DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
651                        << "  UnrolledCost: " << UnrolledCost << "\n");
652      return None;
653    }
654  }
655
656  while (!ExitWorklist.empty()) {
657    BasicBlock *ExitingBB, *ExitBB;
658    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
659
660    for (Instruction &I : *ExitBB) {
661      auto *PN = dyn_cast<PHINode>(&I);
662      if (!PN)
663        break;
664
665      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
666      if (auto *OpI = dyn_cast<Instruction>(Op))
667        if (L->contains(OpI))
668          AddCostRecursively(*OpI, TripCount - 1);
669    }
670  }
671
672  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
673                    << "UnrolledCost: " << UnrolledCost << ", "
674                    << "RolledDynamicCost: " << RolledDynamicCost << "\n");
675  return {{UnrolledCost, RolledDynamicCost}};
676}
677
678/// ApproximateLoopSize - Approximate the size of the loop.
679unsigned llvm::ApproximateLoopSize(
680    const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
681    const TargetTransformInfo &TTI,
682    const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
683  CodeMetrics Metrics;
684  for (BasicBlock *BB : L->blocks())
685    Metrics.analyzeBasicBlock(BB, TTI, EphValues);
686  NumCalls = Metrics.NumInlineCandidates;
687  NotDuplicatable = Metrics.notDuplicatable;
688  Convergent = Metrics.convergent;
689
690  unsigned LoopSize = Metrics.NumInsts;
691
692  // Don't allow an estimate of size zero.  This would allows unrolling of loops
693  // with huge iteration counts, which is a compile time problem even if it's
694  // not a problem for code quality. Also, the code using this size may assume
695  // that each loop has at least three instructions (likely a conditional
696  // branch, a comparison feeding that branch, and some kind of loop increment
697  // feeding that comparison instruction).
698  LoopSize = std::max(LoopSize, BEInsns + 1);
699
700  return LoopSize;
701}
702
703// Returns the loop hint metadata node with the given name (for example,
704// "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
705// returned.
706static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
707  if (MDNode *LoopID = L->getLoopID())
708    return GetUnrollMetadata(LoopID, Name);
709  return nullptr;
710}
711
712// Returns true if the loop has an unroll(full) pragma.
713static bool hasUnrollFullPragma(const Loop *L) {
714  return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
715}
716
717// Returns true if the loop has an unroll(enable) pragma. This metadata is used
718// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
719static bool hasUnrollEnablePragma(const Loop *L) {
720  return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
721}
722
723// Returns true if the loop has an runtime unroll(disable) pragma.
724static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
725  return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
726}
727
728// If loop has an unroll_count pragma return the (necessarily
729// positive) value from the pragma.  Otherwise return 0.
730static unsigned unrollCountPragmaValue(const Loop *L) {
731  MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
732  if (MD) {
733    assert(MD->getNumOperands() == 2 &&
734           "Unroll count hint metadata should have two operands.");
735    unsigned Count =
736        mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
737    assert(Count >= 1 && "Unroll count must be positive.");
738    return Count;
739  }
740  return 0;
741}
742
743// Computes the boosting factor for complete unrolling.
744// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
745// be beneficial to fully unroll the loop even if unrolledcost is large. We
746// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
747// the unroll threshold.
748static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
749                                            unsigned MaxPercentThresholdBoost) {
750  if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
751    return 100;
752  else if (Cost.UnrolledCost != 0)
753    // The boosting factor is RolledDynamicCost / UnrolledCost
754    return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
755                    MaxPercentThresholdBoost);
756  else
757    return MaxPercentThresholdBoost;
758}
759
760// Returns loop size estimation for unrolled loop.
761static uint64_t getUnrolledLoopSize(
762    unsigned LoopSize,
763    TargetTransformInfo::UnrollingPreferences &UP) {
764  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
765  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
766}
767
768// Returns true if unroll count was set explicitly.
769// Calculates unroll count and writes it to UP.Count.
770// Unless IgnoreUser is true, will also use metadata and command-line options
771// that are specific to to the LoopUnroll pass (which, for instance, are
772// irrelevant for the LoopUnrollAndJam pass).
773// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
774// many LoopUnroll-specific options. The shared functionality should be
775// refactored into it own function.
776bool llvm::computeUnrollCount(
777    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
778    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
779    OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
780    bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
781    TargetTransformInfo::UnrollingPreferences &UP,
782    TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
783
784  // Check for explicit Count.
785  // 1st priority is unroll count set by "unroll-count" option.
786  bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
787  if (UserUnrollCount) {
788    UP.Count = UnrollCount;
789    UP.AllowExpensiveTripCount = true;
790    UP.Force = true;
791    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
792      return true;
793  }
794
795  // 2nd priority is unroll count set by pragma.
796  unsigned PragmaCount = unrollCountPragmaValue(L);
797  if (PragmaCount > 0) {
798    UP.Count = PragmaCount;
799    UP.Runtime = true;
800    UP.AllowExpensiveTripCount = true;
801    UP.Force = true;
802    if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
803        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
804      return true;
805  }
806  bool PragmaFullUnroll = hasUnrollFullPragma(L);
807  if (PragmaFullUnroll && TripCount != 0) {
808    UP.Count = TripCount;
809    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
810      return false;
811  }
812
813  bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
814  bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
815                        PragmaEnableUnroll || UserUnrollCount;
816
817  if (ExplicitUnroll && TripCount != 0) {
818    // If the loop has an unrolling pragma, we want to be more aggressive with
819    // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
820    // value which is larger than the default limits.
821    UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
822    UP.PartialThreshold =
823        std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
824  }
825
826  // 3rd priority is full unroll count.
827  // Full unroll makes sense only when TripCount or its upper bound could be
828  // statically calculated.
829  // Also we need to check if we exceed FullUnrollMaxCount.
830  // If using the upper bound to unroll, TripMultiple should be set to 1 because
831  // we do not know when loop may exit.
832
833  // We can unroll by the upper bound amount if it's generally allowed or if
834  // we know that the loop is executed either the upper bound or zero times.
835  // (MaxOrZero unrolling keeps only the first loop test, so the number of
836  // loop tests remains the same compared to the non-unrolled version, whereas
837  // the generic upper bound unrolling keeps all but the last loop test so the
838  // number of loop tests goes up which may end up being worse on targets with
839  // constrained branch predictor resources so is controlled by an option.)
840  // In addition we only unroll small upper bounds.
841  unsigned FullUnrollMaxTripCount = MaxTripCount;
842  if (!(UP.UpperBound || MaxOrZero) ||
843      FullUnrollMaxTripCount > UnrollMaxUpperBound)
844    FullUnrollMaxTripCount = 0;
845
846  // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
847  // compute the former when the latter is zero.
848  unsigned ExactTripCount = TripCount;
849  assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
850         "ExtractTripCount and UnrollByMaxCount cannot both be non zero.");
851
852  unsigned FullUnrollTripCount =
853      ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
854  UP.Count = FullUnrollTripCount;
855  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
856    // When computing the unrolled size, note that BEInsns are not replicated
857    // like the rest of the loop body.
858    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
859      UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
860      TripCount = FullUnrollTripCount;
861      TripMultiple = UP.UpperBound ? 1 : TripMultiple;
862      return ExplicitUnroll;
863    } else {
864      // The loop isn't that small, but we still can fully unroll it if that
865      // helps to remove a significant number of instructions.
866      // To check that, run additional analysis on the loop.
867      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
868              L, FullUnrollTripCount, DT, SE, EphValues, TTI,
869              UP.Threshold * UP.MaxPercentThresholdBoost / 100,
870              UP.MaxIterationsCountToAnalyze)) {
871        unsigned Boost =
872            getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
873        if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
874          UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
875          TripCount = FullUnrollTripCount;
876          TripMultiple = UP.UpperBound ? 1 : TripMultiple;
877          return ExplicitUnroll;
878        }
879      }
880    }
881  }
882
883  // 4th priority is loop peeling.
884  computePeelCount(L, LoopSize, UP, PP, TripCount, SE);
885  if (PP.PeelCount) {
886    UP.Runtime = false;
887    UP.Count = 1;
888    return ExplicitUnroll;
889  }
890
891  // 5th priority is partial unrolling.
892  // Try partial unroll only when TripCount could be statically calculated.
893  if (TripCount) {
894    UP.Partial |= ExplicitUnroll;
895    if (!UP.Partial) {
896      LLVM_DEBUG(dbgs() << "  will not try to unroll partially because "
897                        << "-unroll-allow-partial not given\n");
898      UP.Count = 0;
899      return false;
900    }
901    if (UP.Count == 0)
902      UP.Count = TripCount;
903    if (UP.PartialThreshold != NoThreshold) {
904      // Reduce unroll count to be modulo of TripCount for partial unrolling.
905      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
906        UP.Count =
907            (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
908            (LoopSize - UP.BEInsns);
909      if (UP.Count > UP.MaxCount)
910        UP.Count = UP.MaxCount;
911      while (UP.Count != 0 && TripCount % UP.Count != 0)
912        UP.Count--;
913      if (UP.AllowRemainder && UP.Count <= 1) {
914        // If there is no Count that is modulo of TripCount, set Count to
915        // largest power-of-two factor that satisfies the threshold limit.
916        // As we'll create fixup loop, do the type of unrolling only if
917        // remainder loop is allowed.
918        UP.Count = UP.DefaultUnrollRuntimeCount;
919        while (UP.Count != 0 &&
920               getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
921          UP.Count >>= 1;
922      }
923      if (UP.Count < 2) {
924        if (PragmaEnableUnroll)
925          ORE->emit([&]() {
926            return OptimizationRemarkMissed(DEBUG_TYPE,
927                                            "UnrollAsDirectedTooLarge",
928                                            L->getStartLoc(), L->getHeader())
929                   << "Unable to unroll loop as directed by unroll(enable) "
930                      "pragma "
931                      "because unrolled size is too large.";
932          });
933        UP.Count = 0;
934      }
935    } else {
936      UP.Count = TripCount;
937    }
938    if (UP.Count > UP.MaxCount)
939      UP.Count = UP.MaxCount;
940    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
941        UP.Count != TripCount)
942      ORE->emit([&]() {
943        return OptimizationRemarkMissed(DEBUG_TYPE,
944                                        "FullUnrollAsDirectedTooLarge",
945                                        L->getStartLoc(), L->getHeader())
946               << "Unable to fully unroll loop as directed by unroll pragma "
947                  "because "
948                  "unrolled size is too large.";
949      });
950    LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count
951                      << "\n");
952    return ExplicitUnroll;
953  }
954  assert(TripCount == 0 &&
955         "All cases when TripCount is constant should be covered here.");
956  if (PragmaFullUnroll)
957    ORE->emit([&]() {
958      return OptimizationRemarkMissed(
959                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
960                 L->getStartLoc(), L->getHeader())
961             << "Unable to fully unroll loop as directed by unroll(full) "
962                "pragma "
963                "because loop has a runtime trip count.";
964    });
965
966  // 6th priority is runtime unrolling.
967  // Don't unroll a runtime trip count loop when it is disabled.
968  if (hasRuntimeUnrollDisablePragma(L)) {
969    UP.Count = 0;
970    return false;
971  }
972
973  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
974  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
975    UP.Count = 0;
976    return false;
977  }
978
979  // Check if the runtime trip count is too small when profile is available.
980  if (L->getHeader()->getParent()->hasProfileData()) {
981    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
982      if (*ProfileTripCount < FlatLoopTripCountThreshold)
983        return false;
984      else
985        UP.AllowExpensiveTripCount = true;
986    }
987  }
988
989  // Reduce count based on the type of unrolling and the threshold values.
990  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
991  if (!UP.Runtime) {
992    LLVM_DEBUG(
993        dbgs() << "  will not try to unroll loop with runtime trip count "
994               << "-unroll-runtime not given\n");
995    UP.Count = 0;
996    return false;
997  }
998  if (UP.Count == 0)
999    UP.Count = UP.DefaultUnrollRuntimeCount;
1000
1001  // Reduce unroll count to be the largest power-of-two factor of
1002  // the original count which satisfies the threshold limit.
1003  while (UP.Count != 0 &&
1004         getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
1005    UP.Count >>= 1;
1006
1007#ifndef NDEBUG
1008  unsigned OrigCount = UP.Count;
1009#endif
1010
1011  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1012    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1013      UP.Count >>= 1;
1014    LLVM_DEBUG(
1015        dbgs() << "Remainder loop is restricted (that could architecture "
1016                  "specific or because the loop contains a convergent "
1017                  "instruction), so unroll count must divide the trip "
1018                  "multiple, "
1019               << TripMultiple << ".  Reducing unroll count from " << OrigCount
1020               << " to " << UP.Count << ".\n");
1021
1022    using namespace ore;
1023
1024    if (PragmaCount > 0 && !UP.AllowRemainder)
1025      ORE->emit([&]() {
1026        return OptimizationRemarkMissed(DEBUG_TYPE,
1027                                        "DifferentUnrollCountFromDirected",
1028                                        L->getStartLoc(), L->getHeader())
1029               << "Unable to unroll loop the number of times directed by "
1030                  "unroll_count pragma because remainder loop is restricted "
1031                  "(that could architecture specific or because the loop "
1032                  "contains a convergent instruction) and so must have an "
1033                  "unroll "
1034                  "count that divides the loop trip multiple of "
1035               << NV("TripMultiple", TripMultiple) << ".  Unrolling instead "
1036               << NV("UnrollCount", UP.Count) << " time(s).";
1037      });
1038  }
1039
1040  if (UP.Count > UP.MaxCount)
1041    UP.Count = UP.MaxCount;
1042
1043  if (MaxTripCount && UP.Count > MaxTripCount)
1044    UP.Count = MaxTripCount;
1045
1046  LLVM_DEBUG(dbgs() << "  runtime unrolling with count: " << UP.Count
1047                    << "\n");
1048  if (UP.Count < 2)
1049    UP.Count = 0;
1050  return ExplicitUnroll;
1051}
1052
1053static LoopUnrollResult tryToUnrollLoop(
1054    Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
1055    const TargetTransformInfo &TTI, AssumptionCache &AC,
1056    OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
1057    ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1058    bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
1059    Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
1060    Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
1061    Optional<bool> ProvidedAllowPeeling,
1062    Optional<bool> ProvidedAllowProfileBasedPeeling,
1063    Optional<unsigned> ProvidedFullUnrollMaxCount) {
1064  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1065                    << L->getHeader()->getParent()->getName() << "] Loop %"
1066                    << L->getHeader()->getName() << "\n");
1067  TransformationMode TM = hasUnrollTransformation(L);
1068  if (TM & TM_Disable)
1069    return LoopUnrollResult::Unmodified;
1070  if (!L->isLoopSimplifyForm()) {
1071    LLVM_DEBUG(
1072        dbgs() << "  Not unrolling loop which is not in loop-simplify form.\n");
1073    return LoopUnrollResult::Unmodified;
1074  }
1075
1076  // When automtatic unrolling is disabled, do not unroll unless overridden for
1077  // this loop.
1078  if (OnlyWhenForced && !(TM & TM_Enable))
1079    return LoopUnrollResult::Unmodified;
1080
1081  bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1082  unsigned NumInlineCandidates;
1083  bool NotDuplicatable;
1084  bool Convergent;
1085  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
1086      L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
1087      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1088      ProvidedFullUnrollMaxCount);
1089  TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
1090      L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling);
1091
1092  // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1093  // as threshold later on.
1094  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1095      !OptForSize)
1096    return LoopUnrollResult::Unmodified;
1097
1098  SmallPtrSet<const Value *, 32> EphValues;
1099  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1100
1101  unsigned LoopSize =
1102      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
1103                          TTI, EphValues, UP.BEInsns);
1104  LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
1105  if (NotDuplicatable) {
1106    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
1107                      << " instructions.\n");
1108    return LoopUnrollResult::Unmodified;
1109  }
1110
1111  // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1112  // later), to (fully) unroll loops, if it does not increase code size.
1113  if (OptForSize)
1114    UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1115
1116  if (NumInlineCandidates != 0) {
1117    LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
1118    return LoopUnrollResult::Unmodified;
1119  }
1120
1121  // Find trip count and trip multiple if count is not available
1122  unsigned TripCount = 0;
1123  unsigned TripMultiple = 1;
1124  // If there are multiple exiting blocks but one of them is the latch, use the
1125  // latch for the trip count estimation. Otherwise insist on a single exiting
1126  // block for the trip count estimation.
1127  BasicBlock *ExitingBlock = L->getLoopLatch();
1128  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1129    ExitingBlock = L->getExitingBlock();
1130  if (ExitingBlock) {
1131    TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
1132    TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1133  }
1134
1135  // If the loop contains a convergent operation, the prelude we'd add
1136  // to do the first few instructions before we hit the unrolled loop
1137  // is unsafe -- it adds a control-flow dependency to the convergent
1138  // operation.  Therefore restrict remainder loop (try unrollig without).
1139  //
1140  // TODO: This is quite conservative.  In practice, convergent_op()
1141  // is likely to be called unconditionally in the loop.  In this
1142  // case, the program would be ill-formed (on most architectures)
1143  // unless n were the same on all threads in a thread group.
1144  // Assuming n is the same on all threads, any kind of unrolling is
1145  // safe.  But currently llvm's notion of convergence isn't powerful
1146  // enough to express this.
1147  if (Convergent)
1148    UP.AllowRemainder = false;
1149
1150  // Try to find the trip count upper bound if we cannot find the exact trip
1151  // count.
1152  unsigned MaxTripCount = 0;
1153  bool MaxOrZero = false;
1154  if (!TripCount) {
1155    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1156    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1157  }
1158
1159  // computeUnrollCount() decides whether it is beneficial to use upper bound to
1160  // fully unroll the loop.
1161  bool UseUpperBound = false;
1162  bool IsCountSetExplicitly = computeUnrollCount(
1163      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
1164      TripMultiple, LoopSize, UP, PP, UseUpperBound);
1165  if (!UP.Count)
1166    return LoopUnrollResult::Unmodified;
1167  // Unroll factor (Count) must be less or equal to TripCount.
1168  if (TripCount && UP.Count > TripCount)
1169    UP.Count = TripCount;
1170
1171  // Save loop properties before it is transformed.
1172  MDNode *OrigLoopID = L->getLoopID();
1173
1174  // Unroll the loop.
1175  Loop *RemainderLoop = nullptr;
1176  LoopUnrollResult UnrollResult = UnrollLoop(
1177      L,
1178      {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
1179       UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
1180       ForgetAllSCEV},
1181      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
1182  if (UnrollResult == LoopUnrollResult::Unmodified)
1183    return LoopUnrollResult::Unmodified;
1184
1185  if (RemainderLoop) {
1186    Optional<MDNode *> RemainderLoopID =
1187        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
1188                                        LLVMLoopUnrollFollowupRemainder});
1189    if (RemainderLoopID.hasValue())
1190      RemainderLoop->setLoopID(RemainderLoopID.getValue());
1191  }
1192
1193  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1194    Optional<MDNode *> NewLoopID =
1195        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
1196                                        LLVMLoopUnrollFollowupUnrolled});
1197    if (NewLoopID.hasValue()) {
1198      L->setLoopID(NewLoopID.getValue());
1199
1200      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1201      // explicitly.
1202      return UnrollResult;
1203    }
1204  }
1205
1206  // If loop has an unroll count pragma or unrolled by explicitly set count
1207  // mark loop as unrolled to prevent unrolling beyond that requested.
1208  // If the loop was peeled, we already "used up" the profile information
1209  // we had, so we don't want to unroll or peel again.
1210  if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
1211      (IsCountSetExplicitly || (PP.PeelProfiledIterations && PP.PeelCount)))
1212    L->setLoopAlreadyUnrolled();
1213
1214  return UnrollResult;
1215}
1216
1217namespace {
1218
1219class LoopUnroll : public LoopPass {
1220public:
1221  static char ID; // Pass ID, replacement for typeid
1222
1223  int OptLevel;
1224
1225  /// If false, use a cost model to determine whether unrolling of a loop is
1226  /// profitable. If true, only loops that explicitly request unrolling via
1227  /// metadata are considered. All other loops are skipped.
1228  bool OnlyWhenForced;
1229
1230  /// If false, when SCEV is invalidated, only forget everything in the
1231  /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1232  /// Otherwise, forgetAllLoops and rebuild when needed next.
1233  bool ForgetAllSCEV;
1234
1235  Optional<unsigned> ProvidedCount;
1236  Optional<unsigned> ProvidedThreshold;
1237  Optional<bool> ProvidedAllowPartial;
1238  Optional<bool> ProvidedRuntime;
1239  Optional<bool> ProvidedUpperBound;
1240  Optional<bool> ProvidedAllowPeeling;
1241  Optional<bool> ProvidedAllowProfileBasedPeeling;
1242  Optional<unsigned> ProvidedFullUnrollMaxCount;
1243
1244  LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1245             bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
1246             Optional<unsigned> Count = None,
1247             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
1248             Optional<bool> UpperBound = None,
1249             Optional<bool> AllowPeeling = None,
1250             Optional<bool> AllowProfileBasedPeeling = None,
1251             Optional<unsigned> ProvidedFullUnrollMaxCount = None)
1252      : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1253        ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1254        ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1255        ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1256        ProvidedAllowPeeling(AllowPeeling),
1257        ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1258        ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1259    initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
1260  }
1261
1262  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1263    if (skipLoop(L))
1264      return false;
1265
1266    Function &F = *L->getHeader()->getParent();
1267
1268    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1269    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1270    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1271    const TargetTransformInfo &TTI =
1272        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1273    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1274    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1275    // pass.  Function analyses need to be preserved across loop transformations
1276    // but ORE cannot be preserved (see comment before the pass definition).
1277    OptimizationRemarkEmitter ORE(&F);
1278    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1279
1280    LoopUnrollResult Result = tryToUnrollLoop(
1281        L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1282        OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
1283        ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1284        ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
1285        ProvidedFullUnrollMaxCount);
1286
1287    if (Result == LoopUnrollResult::FullyUnrolled)
1288      LPM.markLoopAsDeleted(*L);
1289
1290    return Result != LoopUnrollResult::Unmodified;
1291  }
1292
1293  /// This transformation requires natural loop information & requires that
1294  /// loop preheaders be inserted into the CFG...
1295  void getAnalysisUsage(AnalysisUsage &AU) const override {
1296    AU.addRequired<AssumptionCacheTracker>();
1297    AU.addRequired<TargetTransformInfoWrapperPass>();
1298    // FIXME: Loop passes are required to preserve domtree, and for now we just
1299    // recreate dom info if anything gets unrolled.
1300    getLoopAnalysisUsage(AU);
1301  }
1302};
1303
1304} // end anonymous namespace
1305
1306char LoopUnroll::ID = 0;
1307
1308INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1309INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1310INITIALIZE_PASS_DEPENDENCY(LoopPass)
1311INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1312INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1313
1314Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1315                                 bool ForgetAllSCEV, int Threshold, int Count,
1316                                 int AllowPartial, int Runtime, int UpperBound,
1317                                 int AllowPeeling) {
1318  // TODO: It would make more sense for this function to take the optionals
1319  // directly, but that's dangerous since it would silently break out of tree
1320  // callers.
1321  return new LoopUnroll(
1322      OptLevel, OnlyWhenForced, ForgetAllSCEV,
1323      Threshold == -1 ? None : Optional<unsigned>(Threshold),
1324      Count == -1 ? None : Optional<unsigned>(Count),
1325      AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
1326      Runtime == -1 ? None : Optional<bool>(Runtime),
1327      UpperBound == -1 ? None : Optional<bool>(UpperBound),
1328      AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
1329}
1330
1331Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1332                                       bool ForgetAllSCEV) {
1333  return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
1334                              0, 0, 0, 0);
1335}
1336
1337PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
1338                                          LoopStandardAnalysisResults &AR,
1339                                          LPMUpdater &Updater) {
1340  // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1341  // pass. Function analyses need to be preserved across loop transformations
1342  // but ORE cannot be preserved (see comment before the pass definition).
1343  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1344
1345  // Keep track of the previous loop structure so we can identify new loops
1346  // created by unrolling.
1347  Loop *ParentL = L.getParentLoop();
1348  SmallPtrSet<Loop *, 4> OldLoops;
1349  if (ParentL)
1350    OldLoops.insert(ParentL->begin(), ParentL->end());
1351  else
1352    OldLoops.insert(AR.LI.begin(), AR.LI.end());
1353
1354  std::string LoopName = std::string(L.getName());
1355
1356  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
1357                                 /*BFI*/ nullptr, /*PSI*/ nullptr,
1358                                 /*PreserveLCSSA*/ true, OptLevel,
1359                                 OnlyWhenForced, ForgetSCEV, /*Count*/ None,
1360                                 /*Threshold*/ None, /*AllowPartial*/ false,
1361                                 /*Runtime*/ false, /*UpperBound*/ false,
1362                                 /*AllowPeeling*/ false,
1363                                 /*AllowProfileBasedPeeling*/ false,
1364                                 /*FullUnrollMaxCount*/ None) !=
1365                 LoopUnrollResult::Unmodified;
1366  if (!Changed)
1367    return PreservedAnalyses::all();
1368
1369  // The parent must not be damaged by unrolling!
1370#ifndef NDEBUG
1371  if (ParentL)
1372    ParentL->verifyLoop();
1373#endif
1374
1375  // Unrolling can do several things to introduce new loops into a loop nest:
1376  // - Full unrolling clones child loops within the current loop but then
1377  //   removes the current loop making all of the children appear to be new
1378  //   sibling loops.
1379  //
1380  // When a new loop appears as a sibling loop after fully unrolling,
1381  // its nesting structure has fundamentally changed and we want to revisit
1382  // it to reflect that.
1383  //
1384  // When unrolling has removed the current loop, we need to tell the
1385  // infrastructure that it is gone.
1386  //
1387  // Finally, we support a debugging/testing mode where we revisit child loops
1388  // as well. These are not expected to require further optimizations as either
1389  // they or the loop they were cloned from have been directly visited already.
1390  // But the debugging mode allows us to check this assumption.
1391  bool IsCurrentLoopValid = false;
1392  SmallVector<Loop *, 4> SibLoops;
1393  if (ParentL)
1394    SibLoops.append(ParentL->begin(), ParentL->end());
1395  else
1396    SibLoops.append(AR.LI.begin(), AR.LI.end());
1397  erase_if(SibLoops, [&](Loop *SibLoop) {
1398    if (SibLoop == &L) {
1399      IsCurrentLoopValid = true;
1400      return true;
1401    }
1402
1403    // Otherwise erase the loop from the list if it was in the old loops.
1404    return OldLoops.count(SibLoop) != 0;
1405  });
1406  Updater.addSiblingLoops(SibLoops);
1407
1408  if (!IsCurrentLoopValid) {
1409    Updater.markLoopAsDeleted(L, LoopName);
1410  } else {
1411    // We can only walk child loops if the current loop remained valid.
1412    if (UnrollRevisitChildLoops) {
1413      // Walk *all* of the child loops.
1414      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1415      Updater.addChildLoops(ChildLoops);
1416    }
1417  }
1418
1419  return getLoopPassPreservedAnalyses();
1420}
1421
1422PreservedAnalyses LoopUnrollPass::run(Function &F,
1423                                      FunctionAnalysisManager &AM) {
1424  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1425  auto &LI = AM.getResult<LoopAnalysis>(F);
1426  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1427  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1428  auto &AC = AM.getResult<AssumptionAnalysis>(F);
1429  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1430
1431  LoopAnalysisManager *LAM = nullptr;
1432  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1433    LAM = &LAMProxy->getManager();
1434
1435  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1436  ProfileSummaryInfo *PSI =
1437      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1438  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1439      &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1440
1441  bool Changed = false;
1442
1443  // The unroller requires loops to be in simplified form, and also needs LCSSA.
1444  // Since simplification may add new inner loops, it has to run before the
1445  // legality and profitability checks. This means running the loop unroller
1446  // will simplify all loops, regardless of whether anything end up being
1447  // unrolled.
1448  for (auto &L : LI) {
1449    Changed |=
1450        simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1451    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1452  }
1453
1454  // Add the loop nests in the reverse order of LoopInfo. See method
1455  // declaration.
1456  SmallPriorityWorklist<Loop *, 4> Worklist;
1457  appendLoopsToWorklist(LI, Worklist);
1458
1459  while (!Worklist.empty()) {
1460    // Because the LoopInfo stores the loops in RPO, we walk the worklist
1461    // from back to front so that we work forward across the CFG, which
1462    // for unrolling is only needed to get optimization remarks emitted in
1463    // a forward order.
1464    Loop &L = *Worklist.pop_back_val();
1465#ifndef NDEBUG
1466    Loop *ParentL = L.getParentLoop();
1467#endif
1468
1469    // Check if the profile summary indicates that the profiled application
1470    // has a huge working set size, in which case we disable peeling to avoid
1471    // bloating it further.
1472    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1473    if (PSI && PSI->hasHugeWorkingSetSize())
1474      LocalAllowPeeling = false;
1475    std::string LoopName = std::string(L.getName());
1476    // The API here is quite complex to call and we allow to select some
1477    // flavors of unrolling during construction time (by setting UnrollOpts).
1478    LoopUnrollResult Result = tryToUnrollLoop(
1479        &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1480        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
1481        UnrollOpts.ForgetSCEV, /*Count*/ None,
1482        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
1483        UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1484        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
1485    Changed |= Result != LoopUnrollResult::Unmodified;
1486
1487    // The parent must not be damaged by unrolling!
1488#ifndef NDEBUG
1489    if (Result != LoopUnrollResult::Unmodified && ParentL)
1490      ParentL->verifyLoop();
1491#endif
1492
1493    // Clear any cached analysis results for L if we removed it completely.
1494    if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1495      LAM->clear(L, LoopName);
1496  }
1497
1498  if (!Changed)
1499    return PreservedAnalyses::all();
1500
1501  return getLoopPassPreservedAnalyses();
1502}
1503