LoopStrengthReduce.cpp revision 303975
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This transformation analyzes and transforms the induction variables (and
11// computations derived from them) into forms suitable for efficient execution
12// on the target.
13//
14// This pass performs a strength reduction on array references inside loops that
15// have the loop induction variable as one or more of their components; it
16// rewrites expressions to take advantage of scaled-index addressing modes
17// available on the target, and it performs a variety of other optimizations
18// related to loop induction variables.
19//
20// Terminology note: this code has a lot of handling for "post-increment" or
21// "post-inc" users. This is not talking about post-increment addressing modes;
22// it is instead talking about code like this:
23//
24//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
25//   ...
26//   %i.next = add %i, 1
27//   %c = icmp eq %i.next, %n
28//
29// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
30// it's useful to think about these as the same register, with some uses using
31// the value of the register before the add and some using it after. In this
32// example, the icmp is a post-increment user, since it uses %i.next, which is
33// the value of the induction variable after the increment. The other common
34// case of post-increment users is users outside the loop.
35//
36// TODO: More sophistication in the way Formulae are generated and filtered.
37//
38// TODO: Handle multiple loops at a time.
39//
40// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
41//       of a GlobalValue?
42//
43// TODO: When truncation is free, truncate ICmp users' operands to make it a
44//       smaller encoding (on x86 at least).
45//
46// TODO: When a negated register is used by an add (such as in a list of
47//       multiple base registers, or as the increment expression in an addrec),
48//       we may not actually need both reg and (-1 * reg) in registers; the
49//       negation can be implemented by using a sub instead of an add. The
50//       lack of support for taking this into consideration when making
51//       register pressure decisions is partly worked around by the "Special"
52//       use kind.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Scalar.h"
57#include "llvm/ADT/DenseSet.h"
58#include "llvm/ADT/Hashing.h"
59#include "llvm/ADT/STLExtras.h"
60#include "llvm/ADT/SetVector.h"
61#include "llvm/ADT/SmallBitVector.h"
62#include "llvm/Analysis/IVUsers.h"
63#include "llvm/Analysis/LoopPass.h"
64#include "llvm/Analysis/ScalarEvolutionExpander.h"
65#include "llvm/Analysis/TargetTransformInfo.h"
66#include "llvm/IR/Constants.h"
67#include "llvm/IR/DerivedTypes.h"
68#include "llvm/IR/Dominators.h"
69#include "llvm/IR/Instructions.h"
70#include "llvm/IR/IntrinsicInst.h"
71#include "llvm/IR/Module.h"
72#include "llvm/IR/ValueHandle.h"
73#include "llvm/Support/CommandLine.h"
74#include "llvm/Support/Debug.h"
75#include "llvm/Support/raw_ostream.h"
76#include "llvm/Transforms/Utils/BasicBlockUtils.h"
77#include "llvm/Transforms/Utils/Local.h"
78#include <algorithm>
79using namespace llvm;
80
81#define DEBUG_TYPE "loop-reduce"
82
83/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
84/// bail out. This threshold is far beyond the number of users that LSR can
85/// conceivably solve, so it should not affect generated code, but catches the
86/// worst cases before LSR burns too much compile time and stack space.
87static const unsigned MaxIVUsers = 200;
88
89// Temporary flag to clean up congruent phis after LSR phi expansion.
90// It was originally disabled until its usefulness could be determined, and was
91// slated for removal after the v3.0 release, but it is now enabled by default
92// because IV chains depend on it.
93static cl::opt<bool> EnablePhiElim(
94  "enable-lsr-phielim", cl::Hidden, cl::init(true),
95  cl::desc("Enable LSR phi elimination"));
96
97#ifndef NDEBUG
98// Stress test IV chain generation.
99static cl::opt<bool> StressIVChain(
100  "stress-ivchain", cl::Hidden, cl::init(false),
101  cl::desc("Stress test LSR IV chains"));
102#else
103static bool StressIVChain = false;
104#endif
105
106namespace {
107
108struct MemAccessTy {
109  /// Used in situations where the accessed memory type is unknown.
110  static const unsigned UnknownAddressSpace = ~0u;
111
112  Type *MemTy;
113  unsigned AddrSpace;
114
115  MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
116
117  MemAccessTy(Type *Ty, unsigned AS) :
118    MemTy(Ty), AddrSpace(AS) {}
119
120  bool operator==(MemAccessTy Other) const {
121    return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
122  }
123
124  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
125
126  static MemAccessTy getUnknown(LLVMContext &Ctx) {
127    return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace);
128  }
129};
130
131/// This class holds data which is used to order reuse candidates.
132class RegSortData {
133public:
134  /// This represents the set of LSRUse indices which reference
135  /// a particular register.
136  SmallBitVector UsedByIndices;
137
138  void print(raw_ostream &OS) const;
139  void dump() const;
140};
141
142}
143
144void RegSortData::print(raw_ostream &OS) const {
145  OS << "[NumUses=" << UsedByIndices.count() << ']';
146}
147
148LLVM_DUMP_METHOD
149void RegSortData::dump() const {
150  print(errs()); errs() << '\n';
151}
152
153namespace {
154
155/// Map register candidates to information about how they are used.
156class RegUseTracker {
157  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
158
159  RegUsesTy RegUsesMap;
160  SmallVector<const SCEV *, 16> RegSequence;
161
162public:
163  void countRegister(const SCEV *Reg, size_t LUIdx);
164  void dropRegister(const SCEV *Reg, size_t LUIdx);
165  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
166
167  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
168
169  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
170
171  void clear();
172
173  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
174  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
175  iterator begin() { return RegSequence.begin(); }
176  iterator end()   { return RegSequence.end(); }
177  const_iterator begin() const { return RegSequence.begin(); }
178  const_iterator end() const   { return RegSequence.end(); }
179};
180
181}
182
183void
184RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
185  std::pair<RegUsesTy::iterator, bool> Pair =
186    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
187  RegSortData &RSD = Pair.first->second;
188  if (Pair.second)
189    RegSequence.push_back(Reg);
190  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
191  RSD.UsedByIndices.set(LUIdx);
192}
193
194void
195RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
196  RegUsesTy::iterator It = RegUsesMap.find(Reg);
197  assert(It != RegUsesMap.end());
198  RegSortData &RSD = It->second;
199  assert(RSD.UsedByIndices.size() > LUIdx);
200  RSD.UsedByIndices.reset(LUIdx);
201}
202
203void
204RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
205  assert(LUIdx <= LastLUIdx);
206
207  // Update RegUses. The data structure is not optimized for this purpose;
208  // we must iterate through it and update each of the bit vectors.
209  for (auto &Pair : RegUsesMap) {
210    SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
211    if (LUIdx < UsedByIndices.size())
212      UsedByIndices[LUIdx] =
213        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
214    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
215  }
216}
217
218bool
219RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
220  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
221  if (I == RegUsesMap.end())
222    return false;
223  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
224  int i = UsedByIndices.find_first();
225  if (i == -1) return false;
226  if ((size_t)i != LUIdx) return true;
227  return UsedByIndices.find_next(i) != -1;
228}
229
230const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
231  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
232  assert(I != RegUsesMap.end() && "Unknown register!");
233  return I->second.UsedByIndices;
234}
235
236void RegUseTracker::clear() {
237  RegUsesMap.clear();
238  RegSequence.clear();
239}
240
241namespace {
242
243/// This class holds information that describes a formula for computing
244/// satisfying a use. It may include broken-out immediates and scaled registers.
245struct Formula {
246  /// Global base address used for complex addressing.
247  GlobalValue *BaseGV;
248
249  /// Base offset for complex addressing.
250  int64_t BaseOffset;
251
252  /// Whether any complex addressing has a base register.
253  bool HasBaseReg;
254
255  /// The scale of any complex addressing.
256  int64_t Scale;
257
258  /// The list of "base" registers for this use. When this is non-empty, the
259  /// canonical representation of a formula is
260  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
261  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
262  /// #1 enforces that the scaled register is always used when at least two
263  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
264  /// #2 enforces that 1 * reg is reg.
265  /// This invariant can be temporarily broken while building a formula.
266  /// However, every formula inserted into the LSRInstance must be in canonical
267  /// form.
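  /// Illustrative example (not part of the original comment): the sum
  /// reg1 + reg2 is stored canonically as BaseRegs = {reg1}, ScaledReg = reg2,
  /// Scale = 1 (i.e., reg1 + 1*reg2), while a lone register is simply
  /// BaseRegs = {reg} with ScaledReg == nullptr and Scale == 0.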
268  SmallVector<const SCEV *, 4> BaseRegs;
269
270  /// The 'scaled' register for this use. This should be non-null when Scale is
271  /// not zero.
272  const SCEV *ScaledReg;
273
274  /// An additional constant offset which is added near the use. This requires a
275  /// temporary register, but the offset itself can live in an add immediate
276  /// field rather than a register.
277  int64_t UnfoldedOffset;
278
279  Formula()
280      : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
281        ScaledReg(nullptr), UnfoldedOffset(0) {}
282
283  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
284
285  bool isCanonical() const;
286
287  void canonicalize();
288
289  bool unscale();
290
291  size_t getNumRegs() const;
292  Type *getType() const;
293
294  void deleteBaseReg(const SCEV *&S);
295
296  bool referencesReg(const SCEV *S) const;
297  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
298                                  const RegUseTracker &RegUses) const;
299
300  void print(raw_ostream &OS) const;
301  void dump() const;
302};
303
304}
305
306/// Recursion helper for initialMatch.
307static void DoInitialMatch(const SCEV *S, Loop *L,
308                           SmallVectorImpl<const SCEV *> &Good,
309                           SmallVectorImpl<const SCEV *> &Bad,
310                           ScalarEvolution &SE) {
311  // Collect expressions which properly dominate the loop header.
312  if (SE.properlyDominates(S, L->getHeader())) {
313    Good.push_back(S);
314    return;
315  }
316
317  // Look at add operands.
318  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
319    for (const SCEV *S : Add->operands())
320      DoInitialMatch(S, L, Good, Bad, SE);
321    return;
322  }
323
324  // Look at addrec operands.
325  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
326    if (!AR->getStart()->isZero()) {
327      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
328      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
329                                      AR->getStepRecurrence(SE),
330                                      // FIXME: AR->getNoWrapFlags()
331                                      AR->getLoop(), SCEV::FlagAnyWrap),
332                     L, Good, Bad, SE);
333      return;
334    }
335
336  // Handle a multiplication by -1 (negation) if it didn't fold.
337  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
338    if (Mul->getOperand(0)->isAllOnesValue()) {
339      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
340      const SCEV *NewMul = SE.getMulExpr(Ops);
341
342      SmallVector<const SCEV *, 4> MyGood;
343      SmallVector<const SCEV *, 4> MyBad;
344      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
345      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
346        SE.getEffectiveSCEVType(NewMul->getType())));
347      for (const SCEV *S : MyGood)
348        Good.push_back(SE.getMulExpr(NegOne, S));
349      for (const SCEV *S : MyBad)
350        Bad.push_back(SE.getMulExpr(NegOne, S));
351      return;
352    }
353
354  // Ok, we can't do anything interesting. Just stuff the whole thing into a
355  // register and hope for the best.
356  Bad.push_back(S);
357}
358
359/// Incorporate loop-variant parts of S into this Formula, attempting to keep
360/// all loop-invariant and loop-computable values in a single base register.
361void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
362  SmallVector<const SCEV *, 4> Good;
363  SmallVector<const SCEV *, 4> Bad;
364  DoInitialMatch(S, L, Good, Bad, SE);
365  if (!Good.empty()) {
366    const SCEV *Sum = SE.getAddExpr(Good);
367    if (!Sum->isZero())
368      BaseRegs.push_back(Sum);
369    HasBaseReg = true;
370  }
371  if (!Bad.empty()) {
372    const SCEV *Sum = SE.getAddExpr(Bad);
373    if (!Sum->isZero())
374      BaseRegs.push_back(Sum);
375    HasBaseReg = true;
376  }
377  canonicalize();
378}
379
380/// \brief Check whether or not this formula satisfies the canonical
381/// representation.
382/// \see Formula::BaseRegs.
383bool Formula::isCanonical() const {
384  if (ScaledReg)
385    return Scale != 1 || !BaseRegs.empty();
386  return BaseRegs.size() <= 1;
387}
388
389/// \brief Helper method to morph a formula into its canonical representation.
390/// \see Formula::BaseRegs.
391/// Every formula having more than one base register must use the ScaledReg
392/// field. Otherwise, we would have to do special cases everywhere in LSR
393/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
394/// On the other hand, 1*reg should be canonicalized into reg.
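/// For instance (an illustrative sketch, not from the original comment): a
/// formula built as BaseRegs = {reg1, {0,+,1}<%L>} with no ScaledReg becomes
/// BaseRegs = {reg1}, ScaledReg = {0,+,1}<%L>, Scale = 1; the swap loop below
/// prefers a loop-variant addrec as the ScaledReg when one is available.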
395void Formula::canonicalize() {
396  if (isCanonical())
397    return;
398  // So far we have not needed this case. It would be easy to implement, but
399  // maintaining dead code is useless. Besides, it could hurt compile time.
400  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
401  // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
402  ScaledReg = BaseRegs.back();
403  BaseRegs.pop_back();
404  Scale = 1;
405  size_t BaseRegsSize = BaseRegs.size();
406  size_t Try = 0;
407  // If ScaledReg is an invariant, try to find a variant expression.
408  while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
409    std::swap(ScaledReg, BaseRegs[Try++]);
410}
411
412/// \brief Get rid of the scale in the formula.
413/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
414/// \return true if it was possible to get rid of the scale, false otherwise.
415/// \note After this operation the formula may not be in the canonical form.
416bool Formula::unscale() {
417  if (Scale != 1)
418    return false;
419  Scale = 0;
420  BaseRegs.push_back(ScaledReg);
421  ScaledReg = nullptr;
422  return true;
423}
424
425/// Return the total number of register operands used by this formula. This does
426/// not include register uses implied by non-constant addrec strides.
427size_t Formula::getNumRegs() const {
428  return !!ScaledReg + BaseRegs.size();
429}
430
431/// Return the type of this formula, if it has one, or null otherwise. This type
432/// is meaningless except for the bit size.
433Type *Formula::getType() const {
434  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
435         ScaledReg ? ScaledReg->getType() :
436         BaseGV ? BaseGV->getType() :
437         nullptr;
438}
439
440/// Delete the given base reg from the BaseRegs list.
441void Formula::deleteBaseReg(const SCEV *&S) {
442  if (&S != &BaseRegs.back())
443    std::swap(S, BaseRegs.back());
444  BaseRegs.pop_back();
445}
446
447/// Test if this formula references the given register.
448bool Formula::referencesReg(const SCEV *S) const {
449  return S == ScaledReg ||
450         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
451}
452
453/// Test whether this formula uses registers which are used by uses other than
454/// the use with the given index.
455bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
456                                         const RegUseTracker &RegUses) const {
457  if (ScaledReg)
458    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
459      return true;
460  for (const SCEV *BaseReg : BaseRegs)
461    if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
462      return true;
463  return false;
464}
465
466void Formula::print(raw_ostream &OS) const {
467  bool First = true;
468  if (BaseGV) {
469    if (!First) OS << " + "; else First = false;
470    BaseGV->printAsOperand(OS, /*PrintType=*/false);
471  }
472  if (BaseOffset != 0) {
473    if (!First) OS << " + "; else First = false;
474    OS << BaseOffset;
475  }
476  for (const SCEV *BaseReg : BaseRegs) {
477    if (!First) OS << " + "; else First = false;
478    OS << "reg(" << *BaseReg << ')';
479  }
480  if (HasBaseReg && BaseRegs.empty()) {
481    if (!First) OS << " + "; else First = false;
482    OS << "**error: HasBaseReg**";
483  } else if (!HasBaseReg && !BaseRegs.empty()) {
484    if (!First) OS << " + "; else First = false;
485    OS << "**error: !HasBaseReg**";
486  }
487  if (Scale != 0) {
488    if (!First) OS << " + "; else First = false;
489    OS << Scale << "*reg(";
490    if (ScaledReg)
491      OS << *ScaledReg;
492    else
493      OS << "<unknown>";
494    OS << ')';
495  }
496  if (UnfoldedOffset != 0) {
497    if (!First) OS << " + ";
498    OS << "imm(" << UnfoldedOffset << ')';
499  }
500}
501
502LLVM_DUMP_METHOD
503void Formula::dump() const {
504  print(errs()); errs() << '\n';
505}
506
507/// Return true if the given addrec can be sign-extended without changing its
508/// value.
509static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
510  Type *WideTy =
511    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
512  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
513}
514
515/// Return true if the given add can be sign-extended without changing its
516/// value.
517static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
518  Type *WideTy =
519    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
520  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
521}
522
523/// Return true if the given mul can be sign-extended without changing its
524/// value.
525static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
526  Type *WideTy =
527    IntegerType::get(SE.getContext(),
528                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
529  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
530}
531
532/// Return an expression for LHS /s RHS, if it can be determined and if the
533/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
534/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
535/// the multiplication may overflow, which is useful when the result will be
536/// used in a context where the most significant bits are ignored.
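/// Illustrative examples (under the stated preconditions, not exhaustive):
/// {8,+,4}<%L> /s 4 yields {2,+,1}<%L> when the addrec is sign-extendable or
/// IgnoreSignificantBits is set, and (4 * %x) /s 4 yields %x under the
/// analogous condition for the multiply.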
537static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
538                                ScalarEvolution &SE,
539                                bool IgnoreSignificantBits = false) {
540  // Handle the trivial case, which works for any SCEV type.
541  if (LHS == RHS)
542    return SE.getConstant(LHS->getType(), 1);
543
544  // Handle a few RHS special cases.
545  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
546  if (RC) {
547    const APInt &RA = RC->getAPInt();
548    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
549    // some folding.
550    if (RA.isAllOnesValue())
551      return SE.getMulExpr(LHS, RC);
552    // Handle x /s 1 as x.
553    if (RA == 1)
554      return LHS;
555  }
556
557  // Check for a division of a constant by a constant.
558  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
559    if (!RC)
560      return nullptr;
561    const APInt &LA = C->getAPInt();
562    const APInt &RA = RC->getAPInt();
563    if (LA.srem(RA) != 0)
564      return nullptr;
565    return SE.getConstant(LA.sdiv(RA));
566  }
567
568  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
569  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
570    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
571      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
572                                      IgnoreSignificantBits);
573      if (!Step) return nullptr;
574      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
575                                       IgnoreSignificantBits);
576      if (!Start) return nullptr;
577      // FlagNW is independent of the start value, step direction, and is
578      // preserved with smaller magnitude steps.
579      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
580      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
581    }
582    return nullptr;
583  }
584
585  // Distribute the sdiv over add operands, if the add doesn't overflow.
586  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
587    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
588      SmallVector<const SCEV *, 8> Ops;
589      for (const SCEV *S : Add->operands()) {
590        const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
591        if (!Op) return nullptr;
592        Ops.push_back(Op);
593      }
594      return SE.getAddExpr(Ops);
595    }
596    return nullptr;
597  }
598
599  // Check for a multiply operand that we can pull RHS out of.
600  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
601    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
602      SmallVector<const SCEV *, 4> Ops;
603      bool Found = false;
604      for (const SCEV *S : Mul->operands()) {
605        if (!Found)
606          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
607                                           IgnoreSignificantBits)) {
608            S = Q;
609            Found = true;
610          }
611        Ops.push_back(S);
612      }
613      return Found ? SE.getMulExpr(Ops) : nullptr;
614    }
615    return nullptr;
616  }
617
618  // Otherwise we don't know.
619  return nullptr;
620}
621
622/// If S involves the addition of a constant integer value, return that integer
623/// value, and mutate S to point to a new SCEV with that value excluded.
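/// For example (illustrative only): given S = {4,+,1}<%L> this returns 4 and
/// rewrites S to {0,+,1}<%L>; given S = (16 + %p) it returns 16 and rewrites S
/// to %p.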
624static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
625  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
626    if (C->getAPInt().getMinSignedBits() <= 64) {
627      S = SE.getConstant(C->getType(), 0);
628      return C->getValue()->getSExtValue();
629    }
630  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
631    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
632    int64_t Result = ExtractImmediate(NewOps.front(), SE);
633    if (Result != 0)
634      S = SE.getAddExpr(NewOps);
635    return Result;
636  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
637    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
638    int64_t Result = ExtractImmediate(NewOps.front(), SE);
639    if (Result != 0)
640      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
641                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
642                           SCEV::FlagAnyWrap);
643    return Result;
644  }
645  return 0;
646}
647
648/// If S involves the addition of a GlobalValue address, return that symbol, and
649/// mutate S to point to a new SCEV with that value excluded.
650static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
651  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
652    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
653      S = SE.getConstant(GV->getType(), 0);
654      return GV;
655    }
656  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
657    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
658    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
659    if (Result)
660      S = SE.getAddExpr(NewOps);
661    return Result;
662  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
663    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
664    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
665    if (Result)
666      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
667                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
668                           SCEV::FlagAnyWrap);
669    return Result;
670  }
671  return nullptr;
672}
673
674/// Returns true if the specified instruction is using the specified value as an
675/// address.
676static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
677  bool isAddress = isa<LoadInst>(Inst);
678  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
679    if (SI->getOperand(1) == OperandVal)
680      isAddress = true;
681  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
682    // Addressing modes can also be folded into prefetches and a variety
683    // of intrinsics.
684    switch (II->getIntrinsicID()) {
685      default: break;
686      case Intrinsic::prefetch:
687      case Intrinsic::x86_sse_storeu_ps:
688      case Intrinsic::x86_sse2_storeu_pd:
689      case Intrinsic::x86_sse2_storeu_dq:
690      case Intrinsic::x86_sse2_storel_dq:
691        if (II->getArgOperand(0) == OperandVal)
692          isAddress = true;
693        break;
694    }
695  }
696  return isAddress;
697}
698
699/// Return the type of the memory being accessed.
700static MemAccessTy getAccessType(const Instruction *Inst) {
701  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
702  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
703    AccessTy.MemTy = SI->getOperand(0)->getType();
704    AccessTy.AddrSpace = SI->getPointerAddressSpace();
705  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
706    AccessTy.AddrSpace = LI->getPointerAddressSpace();
707  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
708    // Addressing modes can also be folded into prefetches and a variety
709    // of intrinsics.
710    switch (II->getIntrinsicID()) {
711    default: break;
712    case Intrinsic::x86_sse_storeu_ps:
713    case Intrinsic::x86_sse2_storeu_pd:
714    case Intrinsic::x86_sse2_storeu_dq:
715    case Intrinsic::x86_sse2_storel_dq:
716      AccessTy.MemTy = II->getArgOperand(0)->getType();
717      break;
718    }
719  }
720
721  // All pointers have the same requirements, so canonicalize them to an
722  // arbitrary pointer type to minimize variation.
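  // (For example, i32 addrspace(1)* and i8 addrspace(1)* memory types would
  // both be canonicalized to i1 addrspace(1)* here.)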
723  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
724    AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
725                                      PTy->getAddressSpace());
726
727  return AccessTy;
728}
729
730/// Return true if this AddRec is already a phi in its loop.
731static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
732  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
733       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
734    if (SE.isSCEVable(PN->getType()) &&
735        (SE.getEffectiveSCEVType(PN->getType()) ==
736         SE.getEffectiveSCEVType(AR->getType())) &&
737        SE.getSCEV(PN) == AR)
738      return true;
739  }
740  return false;
741}
742
743/// Check if expanding this expression is likely to incur significant cost. This
744/// is tricky because SCEV doesn't track which expressions are actually computed
745/// by the current IR.
746///
747/// We currently allow expansion of IV increments that involve adds,
748/// multiplication by constants, and AddRecs from existing phis.
749///
750/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
751/// obvious multiple of the UDivExpr.
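/// Illustrative examples (not from the original comment): a truncate or
/// extension of a cheap expression stays cheap, an addrec that is already
/// materialized as a phi in its loop is cheap, and a udiv currently falls
/// through to the conservative "high cost" answer.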
752static bool isHighCostExpansion(const SCEV *S,
753                                SmallPtrSetImpl<const SCEV*> &Processed,
754                                ScalarEvolution &SE) {
755  // Zero/One operand expressions
756  switch (S->getSCEVType()) {
757  case scUnknown:
758  case scConstant:
759    return false;
760  case scTruncate:
761    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
762                               Processed, SE);
763  case scZeroExtend:
764    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
765                               Processed, SE);
766  case scSignExtend:
767    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
768                               Processed, SE);
769  }
770
771  if (!Processed.insert(S).second)
772    return false;
773
774  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
775    for (const SCEV *S : Add->operands()) {
776      if (isHighCostExpansion(S, Processed, SE))
777        return true;
778    }
779    return false;
780  }
781
782  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
783    if (Mul->getNumOperands() == 2) {
784      // Multiplication by a constant is ok
785      if (isa<SCEVConstant>(Mul->getOperand(0)))
786        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
787
788      // If we have the value of one operand, check if an existing
789      // multiplication already generates this expression.
790      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
791        Value *UVal = U->getValue();
792        for (User *UR : UVal->users()) {
793          // If U is a constant, it may be used by a ConstantExpr.
794          Instruction *UI = dyn_cast<Instruction>(UR);
795          if (UI && UI->getOpcode() == Instruction::Mul &&
796              SE.isSCEVable(UI->getType())) {
797            return SE.getSCEV(UI) == Mul;
798          }
799        }
800      }
801    }
802  }
803
804  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
805    if (isExistingPhi(AR, SE))
806      return false;
807  }
808
809  // For now, consider any other type of expression (div/mul/min/max) high cost.
810  return true;
811}
812
813/// If any of the instructions in the specified set are trivially dead, delete
814/// them and see if this makes any of their operands subsequently dead.
815static bool
816DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
817  bool Changed = false;
818
819  while (!DeadInsts.empty()) {
820    Value *V = DeadInsts.pop_back_val();
821    Instruction *I = dyn_cast_or_null<Instruction>(V);
822
823    if (!I || !isInstructionTriviallyDead(I))
824      continue;
825
826    for (Use &O : I->operands())
827      if (Instruction *U = dyn_cast<Instruction>(O)) {
828        O = nullptr;
829        if (U->use_empty())
830          DeadInsts.emplace_back(U);
831      }
832
833    I->eraseFromParent();
834    Changed = true;
835  }
836
837  return Changed;
838}
839
840namespace {
841class LSRUse;
842}
843
844/// \brief Check if the addressing mode defined by \p F is completely
845/// folded in \p LU at isel time.
846/// This includes address-mode folding and special icmp tricks.
847/// This function returns true if \p LU can accommodate what \p F
848/// defines and up to 1 base + 1 scaled + offset.
849/// In other words, if \p F has several base registers, this function may
850/// still return true. Therefore, users still need to account for
851/// additional base registers and/or unfolded offsets to derive an
852/// accurate cost model.
853static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
854                                 const LSRUse &LU, const Formula &F);
855// Get the cost of the scaling factor used in F for LU.
856static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
857                                     const LSRUse &LU, const Formula &F);
858
859namespace {
860
861/// This class is used to measure and compare candidate formulae.
862class Cost {
863  /// TODO: Some of these could be merged. Also, a lexical ordering
864  /// isn't always optimal.
865  unsigned NumRegs;
866  unsigned AddRecCost;
867  unsigned NumIVMuls;
868  unsigned NumBaseAdds;
869  unsigned ImmCost;
870  unsigned SetupCost;
871  unsigned ScaleCost;
872
873public:
874  Cost()
875    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
876      SetupCost(0), ScaleCost(0) {}
877
878  bool operator<(const Cost &Other) const;
879
880  void Lose();
881
882#ifndef NDEBUG
883  // Once any of the metrics loses, they must all remain losers.
884  bool isValid() {
885    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
886             | ImmCost | SetupCost | ScaleCost) != ~0u)
887      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
888           & ImmCost & SetupCost & ScaleCost) == ~0u);
889  }
890#endif
891
892  bool isLoser() {
893    assert(isValid() && "invalid cost");
894    return NumRegs == ~0u;
895  }
896
897  void RateFormula(const TargetTransformInfo &TTI,
898                   const Formula &F,
899                   SmallPtrSetImpl<const SCEV *> &Regs,
900                   const DenseSet<const SCEV *> &VisitedRegs,
901                   const Loop *L,
902                   const SmallVectorImpl<int64_t> &Offsets,
903                   ScalarEvolution &SE, DominatorTree &DT,
904                   const LSRUse &LU,
905                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
906
907  void print(raw_ostream &OS) const;
908  void dump() const;
909
910private:
911  void RateRegister(const SCEV *Reg,
912                    SmallPtrSetImpl<const SCEV *> &Regs,
913                    const Loop *L,
914                    ScalarEvolution &SE, DominatorTree &DT);
915  void RatePrimaryRegister(const SCEV *Reg,
916                           SmallPtrSetImpl<const SCEV *> &Regs,
917                           const Loop *L,
918                           ScalarEvolution &SE, DominatorTree &DT,
919                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
920};
921
922}
923
924/// Tally up interesting quantities from the given register.
925void Cost::RateRegister(const SCEV *Reg,
926                        SmallPtrSetImpl<const SCEV *> &Regs,
927                        const Loop *L,
928                        ScalarEvolution &SE, DominatorTree &DT) {
929  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
930    // If this is an addrec for another loop, don't second-guess its addrec phi
931    // nodes. LSR isn't currently smart enough to reason about more than one
932    // loop at a time. LSR has already run on inner loops, will not run on outer
933    // loops, and cannot be expected to change sibling loops.
934    if (AR->getLoop() != L) {
935      // If the AddRec exists, consider its register free and leave it alone.
936      if (isExistingPhi(AR, SE))
937        return;
938
939      // Otherwise, do not consider this formula at all.
940      Lose();
941      return;
942    }
943    AddRecCost += 1; /// TODO: This should be a function of the stride.
944
945    // Add the step value register, if it needs one.
946    // TODO: The non-affine case isn't precisely modeled here.
947    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
948      if (!Regs.count(AR->getOperand(1))) {
949        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
950        if (isLoser())
951          return;
952      }
953    }
954  }
955  ++NumRegs;
956
957  // Rough heuristic; favor registers which don't require extra setup
958  // instructions in the preheader.
959  if (!isa<SCEVUnknown>(Reg) &&
960      !isa<SCEVConstant>(Reg) &&
961      !(isa<SCEVAddRecExpr>(Reg) &&
962        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
963         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
964    ++SetupCost;
965
966  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
967               SE.hasComputableLoopEvolution(Reg, L);
968}
969
970/// Record this register in the set. If we haven't seen it before, rate
971/// it. Optional LoserRegs provides a way to declare any formula that refers to
972/// one of those regs an instant loser.
973void Cost::RatePrimaryRegister(const SCEV *Reg,
974                               SmallPtrSetImpl<const SCEV *> &Regs,
975                               const Loop *L,
976                               ScalarEvolution &SE, DominatorTree &DT,
977                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
978  if (LoserRegs && LoserRegs->count(Reg)) {
979    Lose();
980    return;
981  }
982  if (Regs.insert(Reg).second) {
983    RateRegister(Reg, Regs, L, SE, DT);
984    if (LoserRegs && isLoser())
985      LoserRegs->insert(Reg);
986  }
987}
988
989void Cost::RateFormula(const TargetTransformInfo &TTI,
990                       const Formula &F,
991                       SmallPtrSetImpl<const SCEV *> &Regs,
992                       const DenseSet<const SCEV *> &VisitedRegs,
993                       const Loop *L,
994                       const SmallVectorImpl<int64_t> &Offsets,
995                       ScalarEvolution &SE, DominatorTree &DT,
996                       const LSRUse &LU,
997                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
998  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
999  // Tally up the registers.
1000  if (const SCEV *ScaledReg = F.ScaledReg) {
1001    if (VisitedRegs.count(ScaledReg)) {
1002      Lose();
1003      return;
1004    }
1005    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
1006    if (isLoser())
1007      return;
1008  }
1009  for (const SCEV *BaseReg : F.BaseRegs) {
1010    if (VisitedRegs.count(BaseReg)) {
1011      Lose();
1012      return;
1013    }
1014    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
1015    if (isLoser())
1016      return;
1017  }
1018
1019  // Determine how many (unfolded) adds we'll need inside the loop.
1020  size_t NumBaseParts = F.getNumRegs();
1021  if (NumBaseParts > 1)
1022    // Do not count the base and a possible second register if the target
1023    // allows folding 2 registers.
1024    NumBaseAdds +=
1025        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
1026  NumBaseAdds += (F.UnfoldedOffset != 0);
1027
1028  // Accumulate non-free scaling amounts.
1029  ScaleCost += getScalingFactorCost(TTI, LU, F);
1030
1031  // Tally up the non-zero immediates.
1032  for (int64_t O : Offsets) {
1033    int64_t Offset = (uint64_t)O + F.BaseOffset;
1034    if (F.BaseGV)
1035      ImmCost += 64; // Handle symbolic values conservatively.
1036                     // TODO: This should probably be the pointer size.
1037    else if (Offset != 0)
1038      ImmCost += APInt(64, Offset, true).getMinSignedBits();
1039  }
1040  assert(isValid() && "invalid cost");
1041}
1042
1043/// Set this cost to a losing value.
1044void Cost::Lose() {
1045  NumRegs = ~0u;
1046  AddRecCost = ~0u;
1047  NumIVMuls = ~0u;
1048  NumBaseAdds = ~0u;
1049  ImmCost = ~0u;
1050  SetupCost = ~0u;
1051  ScaleCost = ~0u;
1052}
1053
1054/// Choose the lower cost.
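/// The comparison is lexicographic, so, for example, a solution needing fewer
/// registers is always preferred regardless of its immediate or setup cost.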
1055bool Cost::operator<(const Cost &Other) const {
1056  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
1057                  ImmCost, SetupCost) <
1058         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
1059                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
1060                  Other.SetupCost);
1061}
1062
1063void Cost::print(raw_ostream &OS) const {
1064  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
1065  if (AddRecCost != 0)
1066    OS << ", with addrec cost " << AddRecCost;
1067  if (NumIVMuls != 0)
1068    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
1069  if (NumBaseAdds != 0)
1070    OS << ", plus " << NumBaseAdds << " base add"
1071       << (NumBaseAdds == 1 ? "" : "s");
1072  if (ScaleCost != 0)
1073    OS << ", plus " << ScaleCost << " scale cost";
1074  if (ImmCost != 0)
1075    OS << ", plus " << ImmCost << " imm cost";
1076  if (SetupCost != 0)
1077    OS << ", plus " << SetupCost << " setup cost";
1078}
1079
1080LLVM_DUMP_METHOD
1081void Cost::dump() const {
1082  print(errs()); errs() << '\n';
1083}
1084
1085namespace {
1086
1087/// An operand value in an instruction which is to be replaced with some
1088/// equivalent, possibly strength-reduced, replacement.
1089struct LSRFixup {
1090  /// The instruction which will be updated.
1091  Instruction *UserInst;
1092
1093  /// The operand of the instruction which will be replaced. The operand may be
1094  /// used more than once; every instance will be replaced.
1095  Value *OperandValToReplace;
1096
1097  /// If this user is to use the post-incremented value of an induction
1098  /// variable, this set is non-empty and holds the loops associated with the
1099  /// induction variable.
1100  PostIncLoopSet PostIncLoops;
1101
1102  /// The index of the LSRUse describing the expression which this fixup
1103  /// needs, minus the constant Offset (below).
1104  size_t LUIdx;
1105
1106  /// A constant offset to be added to the LSRUse expression.  This allows
1107  /// multiple fixups to share the same LSRUse with different offsets, for
1108  /// example in an unrolled loop.
1109  int64_t Offset;
1110
1111  bool isUseFullyOutsideLoop(const Loop *L) const;
1112
1113  LSRFixup();
1114
1115  void print(raw_ostream &OS) const;
1116  void dump() const;
1117};
1118
1119}
1120
1121LSRFixup::LSRFixup()
1122  : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
1123    Offset(0) {}
1124
1125/// Test whether this fixup always uses its value outside of the given loop.
1126bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1127  // PHI nodes use their value in their incoming blocks.
1128  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1129    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1130      if (PN->getIncomingValue(i) == OperandValToReplace &&
1131          L->contains(PN->getIncomingBlock(i)))
1132        return false;
1133    return true;
1134  }
1135
1136  return !L->contains(UserInst);
1137}
1138
1139void LSRFixup::print(raw_ostream &OS) const {
1140  OS << "UserInst=";
1141  // Store is common and interesting enough to be worth special-casing.
1142  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1143    OS << "store ";
1144    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1145  } else if (UserInst->getType()->isVoidTy())
1146    OS << UserInst->getOpcodeName();
1147  else
1148    UserInst->printAsOperand(OS, /*PrintType=*/false);
1149
1150  OS << ", OperandValToReplace=";
1151  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1152
1153  for (const Loop *PIL : PostIncLoops) {
1154    OS << ", PostIncLoop=";
1155    PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1156  }
1157
1158  if (LUIdx != ~size_t(0))
1159    OS << ", LUIdx=" << LUIdx;
1160
1161  if (Offset != 0)
1162    OS << ", Offset=" << Offset;
1163}
1164
1165LLVM_DUMP_METHOD
1166void LSRFixup::dump() const {
1167  print(errs()); errs() << '\n';
1168}
1169
1170namespace {
1171
1172/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1173/// SmallVectors of const SCEV*.
1174struct UniquifierDenseMapInfo {
1175  static SmallVector<const SCEV *, 4> getEmptyKey() {
1176    SmallVector<const SCEV *, 4>  V;
1177    V.push_back(reinterpret_cast<const SCEV *>(-1));
1178    return V;
1179  }
1180
1181  static SmallVector<const SCEV *, 4> getTombstoneKey() {
1182    SmallVector<const SCEV *, 4> V;
1183    V.push_back(reinterpret_cast<const SCEV *>(-2));
1184    return V;
1185  }
1186
1187  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1188    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1189  }
1190
1191  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1192                      const SmallVector<const SCEV *, 4> &RHS) {
1193    return LHS == RHS;
1194  }
1195};
1196
1197/// This class holds the state that LSR keeps for each use in IVUsers, as well
1198/// as uses invented by LSR itself. It includes information about what kinds of
1199/// things can be folded into the user, information about the user itself, and
1200/// information about how the use may be satisfied.  TODO: Represent multiple
1201/// users of the same expression in common?
1202class LSRUse {
1203  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1204
1205public:
1206  /// An enum for a kind of use, indicating what types of scaled and immediate
1207  /// operands it might support.
1208  enum KindType {
1209    Basic,   ///< A normal use, with no folding.
1210    Special, ///< A special case of basic, allowing -1 scales.
1211    Address, ///< An address use; folding according to TargetTransformInfo
1212    ICmpZero ///< An equality icmp with both operands folded into one.
1213    // TODO: Add a generic icmp too?
1214  };
1215
1216  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
1217
1218  KindType Kind;
1219  MemAccessTy AccessTy;
1220
1221  SmallVector<int64_t, 8> Offsets;
1222  int64_t MinOffset;
1223  int64_t MaxOffset;
1224
1225  /// This records whether all of the fixups using this LSRUse are outside of
1226  /// the loop, in which case some special-case heuristics may be used.
1227  bool AllFixupsOutsideLoop;
1228
1229  /// RigidFormula is set to true to guarantee that this use will be associated
1230  /// with a single formula--the one that initially matched. Some SCEV
1231  /// expressions cannot be expanded. This allows LSR to consider the registers
1232  /// used by those expressions without the need to expand them later after
1233  /// changing the formula.
1234  bool RigidFormula;
1235
1236  /// This records the widest use type for any fixup using this
1237  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1238  /// fixup widths to be equivalent, because the narrower one may be relying on
1239  /// the implicit truncation to truncate away bogus bits.
1240  Type *WidestFixupType;
1241
1242  /// A list of ways to build a value that can satisfy this user.  After the
1243  /// list is populated, one of these is selected heuristically and used to
1244  /// formulate a replacement for OperandValToReplace in UserInst.
1245  SmallVector<Formula, 12> Formulae;
1246
1247  /// The set of register candidates used by all formulae in this LSRUse.
1248  SmallPtrSet<const SCEV *, 4> Regs;
1249
1250  LSRUse(KindType K, MemAccessTy AT)
1251      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
1252        AllFixupsOutsideLoop(true), RigidFormula(false),
1253        WidestFixupType(nullptr) {}
1254
1255  bool HasFormulaWithSameRegs(const Formula &F) const;
1256  bool InsertFormula(const Formula &F);
1257  void DeleteFormula(Formula &F);
1258  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1259
1260  void print(raw_ostream &OS) const;
1261  void dump() const;
1262};
1263
1264}
1265
1266/// Test whether this use has a formula with the same registers as the given
1267/// formula.
1268bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1269  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1270  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1271  // Unstable sort by host order ok, because this is only used for uniquifying.
1272  std::sort(Key.begin(), Key.end());
1273  return Uniquifier.count(Key);
1274}
1275
1276/// If the given formula has not yet been inserted, add it to the list, and
1277/// return true. Return false otherwise.  The formula must be in canonical form.
1278bool LSRUse::InsertFormula(const Formula &F) {
1279  assert(F.isCanonical() && "Invalid canonical representation");
1280
1281  if (!Formulae.empty() && RigidFormula)
1282    return false;
1283
1284  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1285  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1286  // Unstable sort by host order ok, because this is only used for uniquifying.
1287  std::sort(Key.begin(), Key.end());
1288
1289  if (!Uniquifier.insert(Key).second)
1290    return false;
1291
1292  // Using a register to hold the value of 0 is not profitable.
1293  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1294         "Zero allocated in a scaled register!");
1295#ifndef NDEBUG
1296  for (const SCEV *BaseReg : F.BaseRegs)
1297    assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1298#endif
1299
1300  // Add the formula to the list.
1301  Formulae.push_back(F);
1302
1303  // Record registers now being used by this use.
1304  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1305  if (F.ScaledReg)
1306    Regs.insert(F.ScaledReg);
1307
1308  return true;
1309}
1310
1311/// Remove the given formula from this use's list.
1312void LSRUse::DeleteFormula(Formula &F) {
1313  if (&F != &Formulae.back())
1314    std::swap(F, Formulae.back());
1315  Formulae.pop_back();
1316}
1317
1318/// Recompute the Regs field, and update RegUses.
1319void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1320  // Now that we've filtered out some formulae, recompute the Regs set.
1321  SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1322  Regs.clear();
1323  for (const Formula &F : Formulae) {
1324    if (F.ScaledReg) Regs.insert(F.ScaledReg);
1325    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1326  }
1327
1328  // Update the RegTracker.
1329  for (const SCEV *S : OldRegs)
1330    if (!Regs.count(S))
1331      RegUses.dropRegister(S, LUIdx);
1332}
1333
1334void LSRUse::print(raw_ostream &OS) const {
1335  OS << "LSR Use: Kind=";
1336  switch (Kind) {
1337  case Basic:    OS << "Basic"; break;
1338  case Special:  OS << "Special"; break;
1339  case ICmpZero: OS << "ICmpZero"; break;
1340  case Address:
1341    OS << "Address of ";
1342    if (AccessTy.MemTy->isPointerTy())
1343      OS << "pointer"; // the full pointer type could be really verbose
1344    else {
1345      OS << *AccessTy.MemTy;
1346    }
1347
1348    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1349  }
1350
1351  OS << ", Offsets={";
1352  bool NeedComma = false;
1353  for (int64_t O : Offsets) {
1354    if (NeedComma) OS << ',';
1355    OS << O;
1356    NeedComma = true;
1357  }
1358  OS << '}';
1359
1360  if (AllFixupsOutsideLoop)
1361    OS << ", all-fixups-outside-loop";
1362
1363  if (WidestFixupType)
1364    OS << ", widest fixup type: " << *WidestFixupType;
1365}
1366
1367LLVM_DUMP_METHOD
1368void LSRUse::dump() const {
1369  print(errs()); errs() << '\n';
1370}
1371
1372static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1373                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1374                                 GlobalValue *BaseGV, int64_t BaseOffset,
1375                                 bool HasBaseReg, int64_t Scale) {
1376  switch (Kind) {
1377  case LSRUse::Address:
1378    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
1379                                     HasBaseReg, Scale, AccessTy.AddrSpace);
1380
1381  case LSRUse::ICmpZero:
1382    // There's not even a target hook for querying whether it would be legal to
1383    // fold a GV into an ICmp.
1384    if (BaseGV)
1385      return false;
1386
1387    // ICmp only has two operands; don't allow more than two non-trivial parts.
1388    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1389      return false;
1390
1391    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1392    // putting the scaled register in the other operand of the icmp.
1393    if (Scale != 0 && Scale != -1)
1394      return false;
1395
1396    // If we have low-level target information, ask the target if it can fold an
1397    // integer immediate on an icmp.
1398    if (BaseOffset != 0) {
1399      // We have one of:
1400      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1401      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1402      // After the adjustment below, BaseOffset is the ICmp immediate.
1403      if (Scale == 0)
1404        // The cast does the right thing with INT64_MIN.
1405        BaseOffset = -(uint64_t)BaseOffset;
1406      return TTI.isLegalICmpImmediate(BaseOffset);
1407    }
1408
1409    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1410    return true;
1411
1412  case LSRUse::Basic:
1413    // Only handle single-register values.
1414    return !BaseGV && Scale == 0 && BaseOffset == 0;
1415
1416  case LSRUse::Special:
1417    // Special case Basic to handle -1 scales.
1418    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
1419  }
1420
1421  llvm_unreachable("Invalid LSRUse Kind!");
1422}
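// Illustrative example for the ICmpZero case above (hypothetical values): a use
// of the form (%n + -16) compared against zero can become "icmp eq %n, 16", so
// BaseReg %n with BaseOffset -16 and Scale 0 is completely folded provided
// TTI.isLegalICmpImmediate(16) holds.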
1423
1424static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1425                                 int64_t MinOffset, int64_t MaxOffset,
1426                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1427                                 GlobalValue *BaseGV, int64_t BaseOffset,
1428                                 bool HasBaseReg, int64_t Scale) {
1429  // Check for overflow.
1430  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1431      (MinOffset > 0))
1432    return false;
1433  MinOffset = (uint64_t)BaseOffset + MinOffset;
1434  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1435      (MaxOffset > 0))
1436    return false;
1437  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1438
1439  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1440                              HasBaseReg, Scale) &&
1441         isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1442                              HasBaseReg, Scale);
1443}
1444
1445static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1446                                 int64_t MinOffset, int64_t MaxOffset,
1447                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1448                                 const Formula &F) {
1449  // For the purpose of isAMCompletelyFolded, either having a canonical formula
1450  // or a scale not equal to zero is correct.
1451  // Problems may arise from non-canonical formulae having a scale == 0.
1452  // Strictly speaking, it would be best to just rely on canonical formulae.
1453  // However, when we generate the scaled formulae, we first check that the
1454  // scaling factor is profitable before computing the actual ScaledReg, for
1455  // compile time's sake.
1456  assert((F.isCanonical() || F.Scale != 0));
1457  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1458                              F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1459}
1460
1461/// Test whether we know how to expand the current formula.
1462static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1463                       int64_t MaxOffset, LSRUse::KindType Kind,
1464                       MemAccessTy AccessTy, GlobalValue *BaseGV,
1465                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1466  // We know how to expand completely foldable formulae.
1467  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1468                              BaseOffset, HasBaseReg, Scale) ||
1469         // Or formulae that use a base register produced by a sum of base
1470         // registers.
1471         (Scale == 1 &&
1472          isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1473                               BaseGV, BaseOffset, true, 0));
1474}
1475
1476static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1477                       int64_t MaxOffset, LSRUse::KindType Kind,
1478                       MemAccessTy AccessTy, const Formula &F) {
1479  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1480                    F.BaseOffset, F.HasBaseReg, F.Scale);
1481}
1482
1483static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1484                                 const LSRUse &LU, const Formula &F) {
1485  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1486                              LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1487                              F.Scale);
1488}
1489
1490static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1491                                     const LSRUse &LU, const Formula &F) {
1492  if (!F.Scale)
1493    return 0;
1494
1495  // If the use is not completely folded in that instruction, we will have to
1496  // pay an extra cost only for scale != 1.
1497  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1498                            LU.AccessTy, F))
1499    return F.Scale != 1;
1500
1501  switch (LU.Kind) {
1502  case LSRUse::Address: {
1503    // Check the scaling factor cost with both the min and max offsets.
1504    int ScaleCostMinOffset = TTI.getScalingFactorCost(
1505        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
1506        F.Scale, LU.AccessTy.AddrSpace);
1507    int ScaleCostMaxOffset = TTI.getScalingFactorCost(
1508        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
1509        F.Scale, LU.AccessTy.AddrSpace);
1510
1511    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
1512           "Legal addressing mode has an illegal cost!");
1513    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1514  }
1515  case LSRUse::ICmpZero:
1516  case LSRUse::Basic:
1517  case LSRUse::Special:
1518    // The use is completely folded, i.e., everything is folded into the
1519    // instruction.
1520    return 0;
1521  }
1522
1523  llvm_unreachable("Invalid LSRUse Kind!");
1524}
1525
1526static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1527                             LSRUse::KindType Kind, MemAccessTy AccessTy,
1528                             GlobalValue *BaseGV, int64_t BaseOffset,
1529                             bool HasBaseReg) {
1530  // Fast-path: zero is always foldable.
1531  if (BaseOffset == 0 && !BaseGV) return true;
1532
1533  // Conservatively, create an address with an immediate and a
1534  // base and a scale.
1535  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1536
1537  // Canonicalize a scale of 1 to a base register if the formula doesn't
1538  // already have a base register.
1539  if (!HasBaseReg && Scale == 1) {
1540    Scale = 0;
1541    HasBaseReg = true;
1542  }
1543
1544  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
1545                              HasBaseReg, Scale);
1546}
1547
1548static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1549                             ScalarEvolution &SE, int64_t MinOffset,
1550                             int64_t MaxOffset, LSRUse::KindType Kind,
1551                             MemAccessTy AccessTy, const SCEV *S,
1552                             bool HasBaseReg) {
1553  // Fast-path: zero is always foldable.
1554  if (S->isZero()) return true;
1555
1556  // Conservatively, create an address with an immediate and a
1557  // base and a scale.
1558  int64_t BaseOffset = ExtractImmediate(S, SE);
1559  GlobalValue *BaseGV = ExtractSymbol(S, SE);
1560
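  // ExtractImmediate and ExtractSymbol strip the matched components off of S,
  // leaving behind whatever part of the expression is neither a constant
  // offset nor a symbol.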
1561  // If there's anything else involved, it's not foldable.
1562  if (!S->isZero()) return false;
1563
1564  // Fast-path: zero is always foldable.
1565  if (BaseOffset == 0 && !BaseGV) return true;
1566
1567  // Conservatively, create an address with an immediate and a
1568  // base and a scale.
1569  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1570
1571  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1572                              BaseOffset, HasBaseReg, Scale);
1573}
1574
1575namespace {
1576
1577/// An individual increment in a Chain of IV increments.  Relate an IV user to
1578/// an expression that computes the IV it uses from the IV used by the previous
1579/// link in the Chain.
1580///
1581/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1582/// original IVOperand. The head of the chain's IVOperand is only valid during
1583/// chain collection, before LSR replaces IV users. During chain generation,
1584/// IncExpr can be used to find the new IVOperand that computes the same
1585/// expression.
1586struct IVInc {
1587  Instruction *UserInst;
1588  Value* IVOperand;
1589  const SCEV *IncExpr;
1590
1591  IVInc(Instruction *U, Value *O, const SCEV *E):
1592    UserInst(U), IVOperand(O), IncExpr(E) {}
1593};
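
// A purely illustrative (hypothetical) example: if the head's IV operand has
// SCEV {%base,+,4}<%L>, the head's IncExpr stores that absolute expression,
// while a later link whose operand is 4 bytes further along stores only the
// loop-invariant delta, IncExpr = 4.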
1594
1595// The list of IV increments in program order.  We typically add the head of a
1596// chain without finding subsequent links.
1597struct IVChain {
1598  SmallVector<IVInc,1> Incs;
1599  const SCEV *ExprBase;
1600
1601  IVChain() : ExprBase(nullptr) {}
1602
1603  IVChain(const IVInc &Head, const SCEV *Base)
1604    : Incs(1, Head), ExprBase(Base) {}
1605
1606  typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
1607
1608  // Return the first increment in the chain; the head (Incs[0]) is skipped.
1609  const_iterator begin() const {
1610    assert(!Incs.empty());
1611    return std::next(Incs.begin());
1612  }
1613  const_iterator end() const {
1614    return Incs.end();
1615  }
1616
1617  // Returns true if this chain contains any increments beyond the head.
1618  bool hasIncs() const { return Incs.size() >= 2; }
1619
1620  // Add an IVInc to the end of this chain.
1621  void add(const IVInc &X) { Incs.push_back(X); }
1622
1623  // Returns the last UserInst in the chain.
1624  Instruction *tailUserInst() const { return Incs.back().UserInst; }
1625
1626  // Returns true if IncExpr can be profitably added to this chain.
1627  bool isProfitableIncrement(const SCEV *OperExpr,
1628                             const SCEV *IncExpr,
1629                             ScalarEvolution&);
1630};
1631
1632/// Helper for CollectChains to track multiple IV increment uses.  Distinguish
1633/// between FarUsers that definitely cross IV increments and NearUsers that may
1634/// be used between IV increments.
1635struct ChainUsers {
1636  SmallPtrSet<Instruction*, 4> FarUsers;
1637  SmallPtrSet<Instruction*, 4> NearUsers;
1638};
1639
1640/// This class holds state for the main loop strength reduction logic.
1641class LSRInstance {
1642  IVUsers &IU;
1643  ScalarEvolution &SE;
1644  DominatorTree &DT;
1645  LoopInfo &LI;
1646  const TargetTransformInfo &TTI;
1647  Loop *const L;
1648  bool Changed;
1649
1650  /// This is the insert position at which the current loop's induction variable
1651  /// increment should be placed. In simple loops, this is the latch block's
1652  /// terminator. But in more complicated cases, this is a position which will
1653  /// dominate all the in-loop post-increment users.
1654  Instruction *IVIncInsertPos;
1655
1656  /// Interesting factors between use strides.
1657  SmallSetVector<int64_t, 8> Factors;
1658
1659  /// Interesting use types, to facilitate truncation reuse.
1660  SmallSetVector<Type *, 4> Types;
1661
1662  /// The list of operands which are to be replaced.
1663  SmallVector<LSRFixup, 16> Fixups;
1664
1665  /// The list of interesting uses.
1666  SmallVector<LSRUse, 16> Uses;
1667
1668  /// Track which uses use which register candidates.
1669  RegUseTracker RegUses;
1670
1671  // Limit the number of chains to avoid quadratic behavior. We don't expect to
1672  // have more than a few IV increment chains in a loop. Missing a Chain falls
1673  // back to normal LSR behavior for those uses.
1674  static const unsigned MaxChains = 8;
1675
1676  /// IV users can form a chain of IV increments.
1677  SmallVector<IVChain, MaxChains> IVChainVec;
1678
1679  /// IV users that belong to profitable IVChains.
1680  SmallPtrSet<Use*, MaxChains> IVIncSet;
1681
1682  void OptimizeShadowIV();
1683  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1684  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1685  void OptimizeLoopTermCond();
1686
1687  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
1688                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
1689  void FinalizeChain(IVChain &Chain);
1690  void CollectChains();
1691  void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
1692                       SmallVectorImpl<WeakVH> &DeadInsts);
1693
1694  void CollectInterestingTypesAndFactors();
1695  void CollectFixupsAndInitialFormulae();
1696
1697  LSRFixup &getNewFixup() {
1698    Fixups.push_back(LSRFixup());
1699    return Fixups.back();
1700  }
1701
1702  // Support for sharing of LSRUses between LSRFixups.
1703  typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
1704  UseMapTy UseMap;
1705
1706  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1707                          LSRUse::KindType Kind, MemAccessTy AccessTy);
1708
1709  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
1710                                    MemAccessTy AccessTy);
1711
1712  void DeleteUse(LSRUse &LU, size_t LUIdx);
1713
1714  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1715
1716  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1717  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1718  void CountRegisters(const Formula &F, size_t LUIdx);
1719  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1720
1721  void CollectLoopInvariantFixupsAndFormulae();
1722
1723  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1724                              unsigned Depth = 0);
1725
1726  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
1727                                  const Formula &Base, unsigned Depth,
1728                                  size_t Idx, bool IsScaledReg = false);
1729  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1730  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1731                                   const Formula &Base, size_t Idx,
1732                                   bool IsScaledReg = false);
1733  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1734  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1735                                   const Formula &Base,
1736                                   const SmallVectorImpl<int64_t> &Worklist,
1737                                   size_t Idx, bool IsScaledReg = false);
1738  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1739  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1740  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1741  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1742  void GenerateCrossUseConstantOffsets();
1743  void GenerateAllReuseFormulae();
1744
1745  void FilterOutUndesirableDedicatedRegisters();
1746
1747  size_t EstimateSearchSpaceComplexity() const;
1748  void NarrowSearchSpaceByDetectingSupersets();
1749  void NarrowSearchSpaceByCollapsingUnrolledCode();
1750  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
1751  void NarrowSearchSpaceByPickingWinnerRegs();
1752  void NarrowSearchSpaceUsingHeuristics();
1753
1754  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1755                    Cost &SolutionCost,
1756                    SmallVectorImpl<const Formula *> &Workspace,
1757                    const Cost &CurCost,
1758                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
1759                    DenseSet<const SCEV *> &VisitedRegs) const;
1760  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1761
1762  BasicBlock::iterator
1763    HoistInsertPosition(BasicBlock::iterator IP,
1764                        const SmallVectorImpl<Instruction *> &Inputs) const;
1765  BasicBlock::iterator
1766    AdjustInsertPositionForExpand(BasicBlock::iterator IP,
1767                                  const LSRFixup &LF,
1768                                  const LSRUse &LU,
1769                                  SCEVExpander &Rewriter) const;
1770
1771  Value *Expand(const LSRFixup &LF,
1772                const Formula &F,
1773                BasicBlock::iterator IP,
1774                SCEVExpander &Rewriter,
1775                SmallVectorImpl<WeakVH> &DeadInsts) const;
1776  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
1777                     const Formula &F,
1778                     SCEVExpander &Rewriter,
1779                     SmallVectorImpl<WeakVH> &DeadInsts) const;
1780  void Rewrite(const LSRFixup &LF,
1781               const Formula &F,
1782               SCEVExpander &Rewriter,
1783               SmallVectorImpl<WeakVH> &DeadInsts) const;
1784  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
1785
1786public:
1787  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
1788              LoopInfo &LI, const TargetTransformInfo &TTI);
1789
1790  bool getChanged() const { return Changed; }
1791
1792  void print_factors_and_types(raw_ostream &OS) const;
1793  void print_fixups(raw_ostream &OS) const;
1794  void print_uses(raw_ostream &OS) const;
1795  void print(raw_ostream &OS) const;
1796  void dump() const;
1797};
1798
1799}
1800
1801/// If IV is used in a int-to-float cast inside the loop then try to eliminate
1802/// the cast operation.
1803void LSRInstance::OptimizeShadowIV() {
1804  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1805  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1806    return;
1807
1808  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1809       UI != E; /* empty */) {
1810    IVUsers::const_iterator CandidateUI = UI;
1811    ++UI;
1812    Instruction *ShadowUse = CandidateUI->getUser();
1813    Type *DestTy = nullptr;
1814    bool IsSigned = false;
1815
1816    /* If the shadow use is an int->float cast then insert a second IV
1817       to eliminate this cast.
1818
1819         for (unsigned i = 0; i < n; ++i)
1820           foo((double)i);
1821
1822       is transformed into
1823
1824         double d = 0.0;
1825         for (unsigned i = 0; i < n; ++i, ++d)
1826           foo(d);
1827    */
1828    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
1829      IsSigned = false;
1830      DestTy = UCast->getDestTy();
1831    }
1832    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
1833      IsSigned = true;
1834      DestTy = SCast->getDestTy();
1835    }
1836    if (!DestTy) continue;
1837
1838    // If target does not support DestTy natively then do not apply
1839    // this transformation.
1840    if (!TTI.isTypeLegal(DestTy)) continue;
1841
1842    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1843    if (!PH) continue;
1844    if (PH->getNumIncomingValues() != 2) continue;
1845
1846    Type *SrcTy = PH->getType();
1847    int Mantissa = DestTy->getFPMantissaWidth();
1848    if (Mantissa == -1) continue;
1849    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1850      continue;
1851
1852    unsigned Entry, Latch;
1853    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1854      Entry = 0;
1855      Latch = 1;
1856    } else {
1857      Entry = 1;
1858      Latch = 0;
1859    }
1860
1861    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1862    if (!Init) continue;
1863    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
1864                                        (double)Init->getSExtValue() :
1865                                        (double)Init->getZExtValue());
1866
1867    BinaryOperator *Incr =
1868      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1869    if (!Incr) continue;
1870    if (Incr->getOpcode() != Instruction::Add
1871        && Incr->getOpcode() != Instruction::Sub)
1872      continue;
1873
1874    /* Initialize new IV, double d = 0.0 in above example. */
1875    ConstantInt *C = nullptr;
1876    if (Incr->getOperand(0) == PH)
1877      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1878    else if (Incr->getOperand(1) == PH)
1879      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1880    else
1881      continue;
1882
1883    if (!C) continue;
1884
1885    // Ignore negative constants, as the code below doesn't handle them
1886    // correctly. TODO: Remove this restriction.
1887    if (!C->getValue().isStrictlyPositive()) continue;
1888
1889    /* Add new PHINode. */
1890    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
1891
1892    /* create new increment. '++d' in above example. */
1893    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1894    BinaryOperator *NewIncr =
1895      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1896                               Instruction::FAdd : Instruction::FSub,
1897                             NewPH, CFP, "IV.S.next.", Incr);
1898
1899    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1900    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1901
1902    /* Remove cast operation */
1903    ShadowUse->replaceAllUsesWith(NewPH);
1904    ShadowUse->eraseFromParent();
1905    Changed = true;
1906    break;
1907  }
1908}
1909
1910/// If Cond has an operand that is an expression of an IV, set the IV user and
1911/// stride information and return true, otherwise return false.
1912bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
1913  for (IVStrideUse &U : IU)
1914    if (U.getUser() == Cond) {
1915      // NOTE: we could handle setcc instructions with multiple uses here, but
1916      // InstCombine does it as well for simple uses, it's not clear that it
1917      // occurs enough in real life to handle.
1918      CondUse = &U;
1919      return true;
1920    }
1921  return false;
1922}
1923
1924/// Rewrite the loop's terminating condition if it uses a max computation.
1925///
1926/// This is a narrow solution to a specific, but acute, problem. For loops
1927/// like this:
1928///
1929///   i = 0;
1930///   do {
1931///     p[i] = 0.0;
1932///   } while (++i < n);
1933///
1934/// the trip count isn't just 'n', because 'n' might not be positive. And
1935/// unfortunately this can come up even for loops where the user didn't use
1936/// a C do-while loop. For example, seemingly well-behaved top-test loops
1937/// will commonly be lowered like this:
1938///
1939///   if (n > 0) {
1940///     i = 0;
1941///     do {
1942///       p[i] = 0.0;
1943///     } while (++i < n);
1944///   }
1945///
1946/// and then it's possible for subsequent optimization to obscure the if
1947/// test in such a way that indvars can't find it.
1948///
1949/// When indvars can't find the if test in loops like this, it creates a
1950/// max expression, which allows it to give the loop a canonical
1951/// induction variable:
1952///
1953///   i = 0;
1954///   max = n < 1 ? 1 : n;
1955///   do {
1956///     p[i] = 0.0;
1957///   } while (++i != max);
1958///
1959/// Canonical induction variables are necessary because the loop passes
1960/// are designed around them. The most obvious example of this is the
1961/// LoopInfo analysis, which doesn't remember trip count values. It
1962/// expects to be able to rediscover the trip count each time it is
1963/// needed, and it does this using a simple analysis that only succeeds if
1964/// the loop has a canonical induction variable.
1965///
1966/// However, when it comes time to generate code, the maximum operation
1967/// can be quite costly, especially if it's inside of an outer loop.
1968///
1969/// This function solves this problem by detecting this type of loop and
1970/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1971/// the instructions for the maximum computation.
1972///
1973ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1974  // Check that the loop matches the pattern we're looking for.
1975  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1976      Cond->getPredicate() != CmpInst::ICMP_NE)
1977    return Cond;
1978
1979  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1980  if (!Sel || !Sel->hasOneUse()) return Cond;
1981
1982  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1983  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1984    return Cond;
1985  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
1986
1987  // Add one to the backedge-taken count to get the trip count.
1988  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
1989  if (IterationCount != SE.getSCEV(Sel)) return Cond;
1990
1991  // Check for a max calculation that matches the pattern. There's no check
1992  // for ICMP_ULE here because the comparison would be with zero, which
1993  // isn't interesting.
1994  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1995  const SCEVNAryExpr *Max = nullptr;
1996  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
1997    Pred = ICmpInst::ICMP_SLE;
1998    Max = S;
1999  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2000    Pred = ICmpInst::ICMP_SLT;
2001    Max = S;
2002  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2003    Pred = ICmpInst::ICMP_ULT;
2004    Max = U;
2005  } else {
2006    // No match; bail.
2007    return Cond;
2008  }
2009
2010  // To handle a max with more than two operands, this optimization would
2011  // require additional checking and setup.
2012  if (Max->getNumOperands() != 2)
2013    return Cond;
2014
2015  const SCEV *MaxLHS = Max->getOperand(0);
2016  const SCEV *MaxRHS = Max->getOperand(1);
2017
2018  // ScalarEvolution canonicalizes constants to the left. For < and >, look
2019  // for a comparison with 1. For <= and >=, a comparison with zero.
2020  if (!MaxLHS ||
2021      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2022    return Cond;
2023
2024  // Check the relevant induction variable for conformance to
2025  // the pattern.
2026  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2027  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2028  if (!AR || !AR->isAffine() ||
2029      AR->getStart() != One ||
2030      AR->getStepRecurrence(SE) != One)
2031    return Cond;
2032
2033  assert(AR->getLoop() == L &&
2034         "Loop condition operand is an addrec in a different loop!");
2035
2036  // Check the right operand of the select, and remember it, as it will
2037  // be used in the new comparison instruction.
2038  Value *NewRHS = nullptr;
2039  if (ICmpInst::isTrueWhenEqual(Pred)) {
2040    // Look for n+1, and grab n.
2041    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2042      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2043         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2044           NewRHS = BO->getOperand(0);
2045    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2046      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2047        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2048          NewRHS = BO->getOperand(0);
2049    if (!NewRHS)
2050      return Cond;
2051  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2052    NewRHS = Sel->getOperand(1);
2053  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2054    NewRHS = Sel->getOperand(2);
2055  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2056    NewRHS = SU->getValue();
2057  else
2058    // Max doesn't match expected pattern.
2059    return Cond;
2060
2061  // Determine the new comparison opcode. It may be signed or unsigned,
2062  // and the original comparison may be either equality or inequality.
2063  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2064    Pred = CmpInst::getInversePredicate(Pred);
2065
2066  // Ok, everything looks ok to change the condition into an SLT or SGE and
2067  // delete the max calculation.
2068  ICmpInst *NewCond =
2069    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
2070
2071  // Delete the max calculation instructions.
2072  Cond->replaceAllUsesWith(NewCond);
2073  CondUse->setUser(NewCond);
2074  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2075  Cond->eraseFromParent();
2076  Sel->eraseFromParent();
2077  if (Cmp->use_empty())
2078    Cmp->eraseFromParent();
2079  return NewCond;
2080}
2081
2082/// Change loop terminating condition to use the postinc iv when possible.
2083void
2084LSRInstance::OptimizeLoopTermCond() {
2085  SmallPtrSet<Instruction *, 4> PostIncs;
2086
2087  BasicBlock *LatchBlock = L->getLoopLatch();
2088  SmallVector<BasicBlock*, 8> ExitingBlocks;
2089  L->getExitingBlocks(ExitingBlocks);
2090
2091  for (BasicBlock *ExitingBlock : ExitingBlocks) {
2092
2093    // Get the terminating condition for the loop if possible.  If we
2094    // can, we want to change it to use a post-incremented version of its
2095    // induction variable, to allow coalescing the live ranges for the IV into
2096    // one register value.
2097
2098    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2099    if (!TermBr)
2100      continue;
2101    // FIXME: Overly conservative, termination condition could be an 'or' etc.
2102    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2103      continue;
2104
2105    // Search IVUsesByStride to find Cond's IVUse if there is one.
2106    IVStrideUse *CondUse = nullptr;
2107    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2108    if (!FindIVUserForCond(Cond, CondUse))
2109      continue;
2110
2111    // If the trip count is computed in terms of a max (due to ScalarEvolution
2112    // being unable to find a sufficient guard, for example), change the loop
2113    // comparison to use SLT or ULT instead of NE.
2114    // One consequence of doing this now is that it disrupts the count-down
2115    // optimization. That's not always a bad thing though, because in such
2116    // cases it may still be worthwhile to avoid a max.
2117    Cond = OptimizeMax(Cond, CondUse);
2118
2119    // If this exiting block dominates the latch block, it may also use
2120    // the post-inc value if it won't be shared with other uses.
2121    // Check for dominance.
2122    if (!DT.dominates(ExitingBlock, LatchBlock))
2123      continue;
2124
2125    // Conservatively avoid trying to use the post-inc value in non-latch
2126    // exits if there may be pre-inc users in intervening blocks.
2127    if (LatchBlock != ExitingBlock)
2128      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2129        // Test if the use is reachable from the exiting block. This dominator
2130        // query is a conservative approximation of reachability.
2131        if (&*UI != CondUse &&
2132            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2133          // Conservatively assume there may be reuse if the quotient of their
2134          // strides could be a legal scale.
2135          const SCEV *A = IU.getStride(*CondUse, L);
2136          const SCEV *B = IU.getStride(*UI, L);
2137          if (!A || !B) continue;
2138          if (SE.getTypeSizeInBits(A->getType()) !=
2139              SE.getTypeSizeInBits(B->getType())) {
2140            if (SE.getTypeSizeInBits(A->getType()) >
2141                SE.getTypeSizeInBits(B->getType()))
2142              B = SE.getSignExtendExpr(B, A->getType());
2143            else
2144              A = SE.getSignExtendExpr(A, B->getType());
2145          }
2146          if (const SCEVConstant *D =
2147                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2148            const ConstantInt *C = D->getValue();
2149            // Stride of one or negative one can have reuse with non-addresses.
2150            if (C->isOne() || C->isAllOnesValue())
2151              goto decline_post_inc;
2152            // Avoid weird situations.
2153            if (C->getValue().getMinSignedBits() >= 64 ||
2154                C->getValue().isMinSignedValue())
2155              goto decline_post_inc;
2156            // Check for possible scaled-address reuse.
2157            MemAccessTy AccessTy = getAccessType(UI->getUser());
2158            int64_t Scale = C->getSExtValue();
2159            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2160                                          /*BaseOffset=*/0,
2161                                          /*HasBaseReg=*/false, Scale,
2162                                          AccessTy.AddrSpace))
2163              goto decline_post_inc;
2164            Scale = -Scale;
2165            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2166                                          /*BaseOffset=*/0,
2167                                          /*HasBaseReg=*/false, Scale,
2168                                          AccessTy.AddrSpace))
2169              goto decline_post_inc;
2170          }
2171        }
2172
2173    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
2174                 << *Cond << '\n');
2175
2176    // It's possible for the setcc instruction to be anywhere in the loop, and
2177    // possible for it to have multiple users.  If it is not immediately before
2178    // the exiting block branch, move it.
2179    if (&*++BasicBlock::iterator(Cond) != TermBr) {
2180      if (Cond->hasOneUse()) {
2181        Cond->moveBefore(TermBr);
2182      } else {
2183        // Clone the terminating condition and insert into the loopend.
2184        ICmpInst *OldCond = Cond;
2185        Cond = cast<ICmpInst>(Cond->clone());
2186        Cond->setName(L->getHeader()->getName() + ".termcond");
2187        ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
2188
2189        // Clone the IVUse, as the old use still exists!
2190        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2191        TermBr->replaceUsesOfWith(OldCond, Cond);
2192      }
2193    }
2194
2195    // If we get to here, we know that we can transform the setcc instruction to
2196    // use the post-incremented version of the IV, allowing us to coalesce the
2197    // live ranges for the IV correctly.
2198    CondUse->transformToPostInc(L);
2199    Changed = true;
2200
2201    PostIncs.insert(Cond);
2202  decline_post_inc:;
2203  }
2204
2205  // Determine an insertion point for the loop induction variable increment. It
2206  // must dominate all the post-inc comparisons we just set up, and it must
2207  // dominate the loop latch edge.
2208  IVIncInsertPos = L->getLoopLatch()->getTerminator();
2209  for (Instruction *Inst : PostIncs) {
2210    BasicBlock *BB =
2211      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
2212                                    Inst->getParent());
2213    if (BB == Inst->getParent())
2214      IVIncInsertPos = Inst;
2215    else if (BB != IVIncInsertPos->getParent())
2216      IVIncInsertPos = BB->getTerminator();
2217  }
2218}
2219
2220/// Determine if the given use can accommodate a fixup at the given offset and
2221/// other details. If so, update the use and return true.
2222bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
2223                                     bool HasBaseReg, LSRUse::KindType Kind,
2224                                     MemAccessTy AccessTy) {
2225  int64_t NewMinOffset = LU.MinOffset;
2226  int64_t NewMaxOffset = LU.MaxOffset;
2227  MemAccessTy NewAccessTy = AccessTy;
2228
2229  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2230  // something conservative, however this can pessimize in the case that one of
2231  // the uses will have all its uses outside the loop, for example.
2232  if (LU.Kind != Kind)
2233    return false;
2234
2235  // Check for a mismatched access type, and fall back conservatively as needed.
2236  // TODO: Be less conservative when the type is similar and can use the same
2237  // addressing modes.
2238  if (Kind == LSRUse::Address) {
2239    if (AccessTy != LU.AccessTy)
2240      NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext());
2241  }
2242
2243  // Conservatively assume HasBaseReg is true for now.
2244  if (NewOffset < LU.MinOffset) {
2245    if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2246                          LU.MaxOffset - NewOffset, HasBaseReg))
2247      return false;
2248    NewMinOffset = NewOffset;
2249  } else if (NewOffset > LU.MaxOffset) {
2250    if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2251                          NewOffset - LU.MinOffset, HasBaseReg))
2252      return false;
2253    NewMaxOffset = NewOffset;
2254  }
2255
2256  // Update the use.
2257  LU.MinOffset = NewMinOffset;
2258  LU.MaxOffset = NewMaxOffset;
2259  LU.AccessTy = NewAccessTy;
2260  if (NewOffset != LU.Offsets.back())
2261    LU.Offsets.push_back(NewOffset);
2262  return true;
2263}
2264
2265/// Return an LSRUse index and an offset value for a fixup which needs the given
2266/// expression, with the given kind and optional access type.  Either reuse an
2267/// existing use or create a new one, as needed.
2268std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2269                                               LSRUse::KindType Kind,
2270                                               MemAccessTy AccessTy) {
2271  const SCEV *Copy = Expr;
2272  int64_t Offset = ExtractImmediate(Expr, SE);
2273
2274  // Basic uses can't accept any offset, for example.
2275  if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2276                        Offset, /*HasBaseReg=*/ true)) {
2277    Expr = Copy;
2278    Offset = 0;
2279  }
2280
2281  std::pair<UseMapTy::iterator, bool> P =
2282    UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2283  if (!P.second) {
2284    // A use already existed with this base.
2285    size_t LUIdx = P.first->second;
2286    LSRUse &LU = Uses[LUIdx];
2287    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2288      // Reuse this use.
2289      return std::make_pair(LUIdx, Offset);
2290  }
2291
2292  // Create a new use.
2293  size_t LUIdx = Uses.size();
2294  P.first->second = LUIdx;
2295  Uses.push_back(LSRUse(Kind, AccessTy));
2296  LSRUse &LU = Uses[LUIdx];
2297
2298  // We don't need to track redundant offsets, but we don't need to go out
2299  // of our way here to avoid them.
2300  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
2301    LU.Offsets.push_back(Offset);
2302
2303  LU.MinOffset = Offset;
2304  LU.MaxOffset = Offset;
2305  return std::make_pair(LUIdx, Offset);
2306}
2307
2308/// Delete the given use from the Uses list.
2309void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
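  // Swap-and-pop: move the last use into LU's slot so the Uses vector stays
  // dense, then update RegUses so its per-use bookkeeping follows the move
  // (see swapAndDropUse).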
2310  if (&LU != &Uses.back())
2311    std::swap(LU, Uses.back());
2312  Uses.pop_back();
2313
2314  // Update RegUses.
2315  RegUses.swapAndDropUse(LUIdx, Uses.size());
2316}
2317
2318/// Look for a use distinct from OrigLU which has a formula that has the same
2319/// registers as the given formula.
2320LSRUse *
2321LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2322                                       const LSRUse &OrigLU) {
2323  // Search all uses for the formula. This could be more clever.
2324  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2325    LSRUse &LU = Uses[LUIdx];
2326    // Check whether this use is close enough to OrigLU, to see whether it's
2327    // worthwhile looking through its formulae.
2328    // Ignore ICmpZero uses because they may contain formulae generated by
2329    // GenerateICmpZeroScales, in which case adding fixup offsets may
2330    // be invalid.
2331    if (&LU != &OrigLU &&
2332        LU.Kind != LSRUse::ICmpZero &&
2333        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2334        LU.WidestFixupType == OrigLU.WidestFixupType &&
2335        LU.HasFormulaWithSameRegs(OrigF)) {
2336      // Scan through this use's formulae.
2337      for (const Formula &F : LU.Formulae) {
2338        // Check to see if this formula has the same registers and symbols
2339        // as OrigF.
2340        if (F.BaseRegs == OrigF.BaseRegs &&
2341            F.ScaledReg == OrigF.ScaledReg &&
2342            F.BaseGV == OrigF.BaseGV &&
2343            F.Scale == OrigF.Scale &&
2344            F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2345          if (F.BaseOffset == 0)
2346            return &LU;
2347          // This is the formula where all the registers and symbols matched;
2348          // there aren't going to be any others. Since we declined it, we
2349          // can skip the rest of the formulae and proceed to the next LSRUse.
2350          break;
2351        }
2352      }
2353    }
2354  }
2355
2356  // Nothing looked good.
2357  return nullptr;
2358}
2359
2360void LSRInstance::CollectInterestingTypesAndFactors() {
2361  SmallSetVector<const SCEV *, 4> Strides;
2362
2363  // Collect interesting types and strides.
2364  SmallVector<const SCEV *, 4> Worklist;
2365  for (const IVStrideUse &U : IU) {
2366    const SCEV *Expr = IU.getExpr(U);
2367
2368    // Collect interesting types.
2369    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2370
2371    // Add strides for mentioned loops.
2372    Worklist.push_back(Expr);
2373    do {
2374      const SCEV *S = Worklist.pop_back_val();
2375      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2376        if (AR->getLoop() == L)
2377          Strides.insert(AR->getStepRecurrence(SE));
2378        Worklist.push_back(AR->getStart());
2379      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2380        Worklist.append(Add->op_begin(), Add->op_end());
2381      }
2382    } while (!Worklist.empty());
2383  }
2384
2385  // Compute interesting factors from the set of interesting strides.
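  // For example (illustrative): if one use strides by 4 and another by 8 in
  // this loop, the exact quotient 2 is recorded as a factor; Factors later
  // seed GenerateScales and GenerateICmpZeroScales.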
2386  for (SmallSetVector<const SCEV *, 4>::const_iterator
2387       I = Strides.begin(), E = Strides.end(); I != E; ++I)
2388    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2389         std::next(I); NewStrideIter != E; ++NewStrideIter) {
2390      const SCEV *OldStride = *I;
2391      const SCEV *NewStride = *NewStrideIter;
2392
2393      if (SE.getTypeSizeInBits(OldStride->getType()) !=
2394          SE.getTypeSizeInBits(NewStride->getType())) {
2395        if (SE.getTypeSizeInBits(OldStride->getType()) >
2396            SE.getTypeSizeInBits(NewStride->getType()))
2397          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2398        else
2399          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2400      }
2401      if (const SCEVConstant *Factor =
2402            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2403                                                        SE, true))) {
2404        if (Factor->getAPInt().getMinSignedBits() <= 64)
2405          Factors.insert(Factor->getAPInt().getSExtValue());
2406      } else if (const SCEVConstant *Factor =
2407                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2408                                                               NewStride,
2409                                                               SE, true))) {
2410        if (Factor->getAPInt().getMinSignedBits() <= 64)
2411          Factors.insert(Factor->getAPInt().getSExtValue());
2412      }
2413    }
2414
2415  // If all uses use the same type, don't bother looking for truncation-based
2416  // reuse.
2417  if (Types.size() == 1)
2418    Types.clear();
2419
2420  DEBUG(print_factors_and_types(dbgs()));
2421}
2422
2423/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2424/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2425/// IVStrideUses, we could partially skip this.
2426static User::op_iterator
2427findIVOperand(User::op_iterator OI, User::op_iterator OE,
2428              Loop *L, ScalarEvolution &SE) {
2429  for (; OI != OE; ++OI) {
2430    if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2431      if (!SE.isSCEVable(Oper->getType()))
2432        continue;
2433
2434      if (const SCEVAddRecExpr *AR =
2435          dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2436        if (AR->getLoop() == L)
2437          break;
2438      }
2439    }
2440  }
2441  return OI;
2442}
2443
2444/// IVChain logic must consistently look through TruncInsts to their base
2445/// operands, so wrap this in a convenient helper.
2446static Value *getWideOperand(Value *Oper) {
2447  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2448    return Trunc->getOperand(0);
2449  return Oper;
2450}
2451
2452/// Return true if we allow an IV chain to include both types.
2453static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2454  Type *LType = LVal->getType();
2455  Type *RType = RVal->getType();
2456  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
2457}
2458
2459/// Return an approximation of this SCEV expression's "base", or NULL for any
2460/// constant. Returning the expression itself is conservative. Returning a
2461/// deeper subexpression is more precise and valid as long as it isn't less
2462/// complex than another subexpression. For expressions involving multiple
2463/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2464/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2465/// IVInc==b-a.
2466///
2467/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2468/// SCEVUnknown, we simply return the rightmost SCEV operand.
2469static const SCEV *getExprBase(const SCEV *S) {
2470  switch (S->getSCEVType()) {
2471  default: // including scUnknown.
2472    return S;
2473  case scConstant:
2474    return nullptr;
2475  case scTruncate:
2476    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2477  case scZeroExtend:
2478    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2479  case scSignExtend:
2480    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2481  case scAddExpr: {
2482    // Skip over scaled operands (scMulExpr) to follow add operands as long as
2483    // there's nothing more complex.
2484    // FIXME: not sure if we want to recognize negation.
2485    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2486    for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2487           E(Add->op_begin()); I != E; ++I) {
2488      const SCEV *SubExpr = *I;
2489      if (SubExpr->getSCEVType() == scAddExpr)
2490        return getExprBase(SubExpr);
2491
2492      if (SubExpr->getSCEVType() != scMulExpr)
2493        return SubExpr;
2494    }
2495    return S; // all operands are scaled, be conservative.
2496  }
2497  case scAddRecExpr:
2498    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2499  }
2500}
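
// For instance (an illustrative sketch): getExprBase of {(%p + 16),+,4}<%L>
// recurses into the addrec start (%p + 16) and returns %p, the rightmost
// operand that is neither an add nor a scaled (mul) term.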
2501
2502/// Return true if the chain increment is profitable to expand into a loop
2503/// invariant value, which may require its own register. A profitable chain
2504/// increment will be an offset relative to the same base. We allow such offsets
2505/// to be used as a chain increment as long as they are not obviously
2506/// expensive to expand using real instructions.
2507bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2508                                    const SCEV *IncExpr,
2509                                    ScalarEvolution &SE) {
2510  // Aggressively form chains when -stress-ivchain.
2511  if (StressIVChain)
2512    return true;
2513
2514  // Do not replace a constant offset from IV head with a nonconstant IV
2515  // increment.
2516  if (!isa<SCEVConstant>(IncExpr)) {
2517    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2518    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2519      return false;
2520  }
2521
2522  SmallPtrSet<const SCEV*, 8> Processed;
2523  return !isHighCostExpansion(IncExpr, Processed, SE);
2524}
2525
2526/// Return true if the number of registers needed for the chain is estimated to
2527/// be less than the number required for the individual IV users. First prohibit
2528/// any IV users that keep the IV live across increments (the Users set should
2529/// be empty). Next count the number and type of increments in the chain.
2530///
2531/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2532/// effectively use postinc addressing modes. Only consider it profitable it the
2533/// increments can be computed in fewer registers when chained.
2534///
2535/// TODO: Consider IVInc free if it's already used in another chain.
2536static bool
2537isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
2538                  ScalarEvolution &SE, const TargetTransformInfo &TTI) {
2539  if (StressIVChain)
2540    return true;
2541
2542  if (!Chain.hasIncs())
2543    return false;
2544
2545  if (!Users.empty()) {
2546    DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2547          for (Instruction *Inst : Users) {
2548            dbgs() << "  " << *Inst << "\n";
2549          });
2550    return false;
2551  }
2552  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2553
2554  // The chain itself may require a register, so initialize cost to 1.
2555  int cost = 1;
2556
2557  // A complete chain likely eliminates the need for keeping the original IV in
2558  // a register. LSR does not currently know how to form a complete chain unless
2559  // the header phi already exists.
2560  if (isa<PHINode>(Chain.tailUserInst())
2561      && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2562    --cost;
2563  }
2564  const SCEV *LastIncExpr = nullptr;
2565  unsigned NumConstIncrements = 0;
2566  unsigned NumVarIncrements = 0;
2567  unsigned NumReusedIncrements = 0;
2568  for (const IVInc &Inc : Chain) {
2569    if (Inc.IncExpr->isZero())
2570      continue;
2571
2572    // Incrementing by zero or some constant is neutral. We assume constants can
2573    // be folded into an addressing mode or an add's immediate operand.
2574    if (isa<SCEVConstant>(Inc.IncExpr)) {
2575      ++NumConstIncrements;
2576      continue;
2577    }
2578
2579    if (Inc.IncExpr == LastIncExpr)
2580      ++NumReusedIncrements;
2581    else
2582      ++NumVarIncrements;
2583
2584    LastIncExpr = Inc.IncExpr;
2585  }
2586  // An IV chain with a single increment is handled by LSR's postinc
2587  // uses. However, a chain with multiple increments, if left unchained, keeps
2588  // the IV's value live longer than it needs to be.
2589  if (NumConstIncrements > 1)
2590    --cost;
2591
2592  // Materializing increment expressions in the preheader that didn't exist in
2593  // the original code may cost a register. For example, sign-extended array
2594  // indices can produce ridiculous increments like this:
2595  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2596  cost += NumVarIncrements;
2597
2598  // Reusing variable increments likely saves a register to hold the multiple of
2599  // the stride.
2600  cost -= NumReusedIncrements;
2601
2602  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2603               << "\n");
2604
2605  return cost < 0;
2606}
2607
2608/// Add this IV user to an existing chain or make it the head of a new chain.
2609void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2610                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2611  // When IVs are used as types of varying widths, they are generally converted
2612  // to a wider type with some uses remaining narrow under a (free) trunc.
2613  Value *const NextIV = getWideOperand(IVOper);
2614  const SCEV *const OperExpr = SE.getSCEV(NextIV);
2615  const SCEV *const OperExprBase = getExprBase(OperExpr);
2616
2617  // Visit all existing chains. Check if its IVOper can be computed as a
2618  // profitable loop invariant increment from the last link in the Chain.
2619  unsigned ChainIdx = 0, NChains = IVChainVec.size();
2620  const SCEV *LastIncExpr = nullptr;
2621  for (; ChainIdx < NChains; ++ChainIdx) {
2622    IVChain &Chain = IVChainVec[ChainIdx];
2623
2624    // Prune the solution space aggressively by checking that both IV operands
2625    // are expressions that operate on the same unscaled SCEVUnknown. This
2626    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2627    // first avoids creating extra SCEV expressions.
2628    if (!StressIVChain && Chain.ExprBase != OperExprBase)
2629      continue;
2630
2631    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2632    if (!isCompatibleIVType(PrevIV, NextIV))
2633      continue;
2634
2635    // A phi node terminates a chain.
2636    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2637      continue;
2638
2639    // The increment must be loop-invariant so it can be kept in a register.
2640    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2641    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2642    if (!SE.isLoopInvariant(IncExpr, L))
2643      continue;
2644
2645    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2646      LastIncExpr = IncExpr;
2647      break;
2648    }
2649  }
2650  // If we haven't found a chain, create a new one, unless we hit the max. Don't
2651  // bother for phi nodes, because they must be last in the chain.
2652  if (ChainIdx == NChains) {
2653    if (isa<PHINode>(UserInst))
2654      return;
2655    if (NChains >= MaxChains && !StressIVChain) {
2656      DEBUG(dbgs() << "IV Chain Limit\n");
2657      return;
2658    }
2659    LastIncExpr = OperExpr;
2660    // IVUsers may have skipped over sign/zero extensions. We don't currently
2661    // attempt to form chains involving extensions unless they can be hoisted
2662    // into this loop's AddRec.
2663    if (!isa<SCEVAddRecExpr>(LastIncExpr))
2664      return;
2665    ++NChains;
2666    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
2667                                 OperExprBase));
2668    ChainUsersVec.resize(NChains);
2669    DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
2670                 << ") IV=" << *LastIncExpr << "\n");
2671  } else {
2672    DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
2673                 << ") IV+" << *LastIncExpr << "\n");
2674    // Add this IV user to the end of the chain.
2675    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2676  }
2677  IVChain &Chain = IVChainVec[ChainIdx];
2678
2679  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2680  // This chain's NearUsers become FarUsers.
2681  if (!LastIncExpr->isZero()) {
2682    ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2683                                            NearUsers.end());
2684    NearUsers.clear();
2685  }
2686
2687  // All other uses of IVOperand become near uses of the chain.
2688  // We currently ignore intermediate values within SCEV expressions, assuming
2689  // they will eventually be used by the current chain, or can be computed
2690  // from one of the chain increments. To be more precise we could
2691  // transitively follow its user and only add leaf IV users to the set.
2692  for (User *U : IVOper->users()) {
2693    Instruction *OtherUse = dyn_cast<Instruction>(U);
2694    if (!OtherUse)
2695      continue;
2696    // Uses in the chain will no longer be uses if the chain is formed.
2697    // Include the head of the chain in this iteration (not Chain.begin()).
2698    IVChain::const_iterator IncIter = Chain.Incs.begin();
2699    IVChain::const_iterator IncEnd = Chain.Incs.end();
2700    for (; IncIter != IncEnd; ++IncIter) {
2701      if (IncIter->UserInst == OtherUse)
2702        break;
2703    }
2704    if (IncIter != IncEnd)
2705      continue;
2706
2707    if (SE.isSCEVable(OtherUse->getType())
2708        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
2709        && IU.isIVUserOrOperand(OtherUse)) {
2710      continue;
2711    }
2712    NearUsers.insert(OtherUse);
2713  }
2714
2715  // Since this user is part of the chain, it's no longer considered a use
2716  // of the chain.
2717  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
2718}
2719
2720/// Populate the vector of Chains.
2721///
2722/// This decreases ILP at the architecture level. Targets with ample registers,
2723/// multiple memory ports, and no register renaming probably don't want
2724/// this. However, such targets should probably disable LSR altogether.
2725///
2726/// The job of LSR is to make a reasonable choice of induction variables across
2727/// the loop. Subsequent passes can easily "unchain" computation exposing more
2728/// ILP *within the loop* if the target wants it.
2729///
2730/// Finding the best IV chain is potentially a scheduling problem. Since LSR
2731/// will not reorder memory operations, it will recognize this as a chain, but
2732/// will generate redundant IV increments. Ideally this would be corrected later
2733/// by a smart scheduler:
2734///        = A[i]
2735///        = A[i+x]
2736/// A[i]   =
2737/// A[i+x] =
2738///
2739/// TODO: Walk the entire domtree within this loop, not just the path to the
2740/// loop latch. This will discover chains on side paths, but requires
2741/// maintaining multiple copies of the Chains state.
2742void LSRInstance::CollectChains() {
2743  DEBUG(dbgs() << "Collecting IV Chains.\n");
2744  SmallVector<ChainUsers, 8> ChainUsersVec;
2745
2746  SmallVector<BasicBlock *,8> LatchPath;
2747  BasicBlock *LoopHeader = L->getHeader();
2748  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
2749       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
2750    LatchPath.push_back(Rung->getBlock());
2751  }
2752  LatchPath.push_back(LoopHeader);
2753
2754  // Walk the instruction stream from the loop header to the loop latch.
2755  for (SmallVectorImpl<BasicBlock *>::reverse_iterator
2756         BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
2757       BBIter != BBEnd; ++BBIter) {
2758    for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
2759         I != E; ++I) {
2760      // Skip instructions that weren't seen by IVUsers analysis.
2761      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I))
2762        continue;
2763
2764      // Ignore users that are part of a SCEV expression. This way we only
2765      // consider leaf IV Users. This effectively rediscovers a portion of
2766      // IVUsers analysis but in program order this time.
2767      if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I)))
2768        continue;
2769
2770      // Remove this instruction from any NearUsers set it may be in.
2771      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
2772           ChainIdx < NChains; ++ChainIdx) {
2773        ChainUsersVec[ChainIdx].NearUsers.erase(&*I);
2774      }
2775      // Search for operands that can be chained.
2776      SmallPtrSet<Instruction*, 4> UniqueOperands;
2777      User::op_iterator IVOpEnd = I->op_end();
2778      User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
2779      while (IVOpIter != IVOpEnd) {
2780        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
2781        if (UniqueOperands.insert(IVOpInst).second)
2782          ChainInstruction(&*I, IVOpInst, ChainUsersVec);
2783        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
2784      }
2785    } // Continue walking down the instructions.
2786  } // Continue walking down the domtree.
2787  // Visit phi backedges to determine if the chain can generate the IV postinc.
2788  for (BasicBlock::iterator I = L->getHeader()->begin();
2789       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
2790    if (!SE.isSCEVable(PN->getType()))
2791      continue;
2792
2793    Instruction *IncV =
2794      dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
2795    if (IncV)
2796      ChainInstruction(PN, IncV, ChainUsersVec);
2797  }
2798  // Remove any unprofitable chains.
2799  unsigned ChainIdx = 0;
2800  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
2801       UsersIdx < NChains; ++UsersIdx) {
2802    if (!isProfitableChain(IVChainVec[UsersIdx],
2803                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
2804      continue;
2805    // Preserve the chain at UsersIdx.
2806    if (ChainIdx != UsersIdx)
2807      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
2808    FinalizeChain(IVChainVec[ChainIdx]);
2809    ++ChainIdx;
2810  }
2811  IVChainVec.resize(ChainIdx);
2812}
2813
2814void LSRInstance::FinalizeChain(IVChain &Chain) {
2815  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2816  DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
2817
2818  for (const IVInc &Inc : Chain) {
2819    DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
2820    auto UseI = std::find(Inc.UserInst->op_begin(), Inc.UserInst->op_end(),
2821                          Inc.IVOperand);
2822    assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
2823    IVIncSet.insert(UseI);
2824  }
2825}
2826
2827/// Return true if the IVInc can be folded into an addressing mode.
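/// As an illustration (hypothetical IR; names are placeholders): for a chained
/// address user such as
///   %p1 = getelementptr i8, i8* %p0, i64 8
///   %v  = load i8, i8* %p1
/// the increment expression is the constant 8; on a target whose addressing
/// modes accept a register-plus-immediate offset this is normally foldable, so
/// the chain can keep using its current IV value and fold the offset at the
/// user.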
2828static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
2829                             Value *Operand, const TargetTransformInfo &TTI) {
2830  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
2831  if (!IncConst || !isAddressUse(UserInst, Operand))
2832    return false;
2833
2834  if (IncConst->getAPInt().getMinSignedBits() > 64)
2835    return false;
2836
2837  MemAccessTy AccessTy = getAccessType(UserInst);
2838  int64_t IncOffset = IncConst->getValue()->getSExtValue();
2839  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
2840                        IncOffset, /*HasBaseReg=*/false))
2841    return false;
2842
2843  return true;
2844}
2845
2846/// Generate an add or subtract for each IVInc in a chain to materialize the IV
2847/// user's operand from the previous IV user's operand.
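/// For instance (placeholder names), a chain whose user operands compute %p,
/// %p+4 and %p+8 is materialized roughly as
///   %iv1 = add %iv0, 4
///   %iv2 = add %iv1, 4
/// with each user rewritten to take the preceding value, except where
/// canFoldIVIncExpr decides an increment can instead be folded into the user's
/// addressing mode.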
2848void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
2849                                  SmallVectorImpl<WeakVH> &DeadInsts) {
2850  // Find the new IVOperand for the head of the chain. It may have been replaced
2851  // by LSR.
2852  const IVInc &Head = Chain.Incs[0];
2853  User::op_iterator IVOpEnd = Head.UserInst->op_end();
2854  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
2855  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
2856                                             IVOpEnd, L, SE);
2857  Value *IVSrc = nullptr;
2858  while (IVOpIter != IVOpEnd) {
2859    IVSrc = getWideOperand(*IVOpIter);
2860
2861    // If this operand computes the expression that the chain needs, we may use
2862    // it. (Check this after setting IVSrc which is used below.)
2863    //
2864    // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
2865    // narrow for the chain, so we can no longer use it. We do allow using a
2866    // wider phi, assuming LSR checked for free truncation. In that case we
2867    // should already have a truncate on this operand such that
2868    // getSCEV(IVSrc) == IncExpr.
2869    if (SE.getSCEV(*IVOpIter) == Head.IncExpr
2870        || SE.getSCEV(IVSrc) == Head.IncExpr) {
2871      break;
2872    }
2873    IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
2874  }
2875  if (IVOpIter == IVOpEnd) {
2876    // Gracefully give up on this chain.
2877    DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
2878    return;
2879  }
2880
2881  DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
2882  Type *IVTy = IVSrc->getType();
2883  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
2884  const SCEV *LeftOverExpr = nullptr;
2885  for (const IVInc &Inc : Chain) {
2886    Instruction *InsertPt = Inc.UserInst;
2887    if (isa<PHINode>(InsertPt))
2888      InsertPt = L->getLoopLatch()->getTerminator();
2889
2890    // IVOper will replace the current IV User's operand. IVSrc is the IV
2891    // value currently held in a register.
2892    Value *IVOper = IVSrc;
2893    if (!Inc.IncExpr->isZero()) {
2894      // IncExpr was the result of subtracting two narrow values, so it must be
2895      // sign extended.
2896      const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
2897      LeftOverExpr = LeftOverExpr ?
2898        SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
2899    }
2900    if (LeftOverExpr && !LeftOverExpr->isZero()) {
2901      // Expand the IV increment.
2902      Rewriter.clearPostInc();
2903      Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
2904      const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
2905                                             SE.getUnknown(IncV));
2906      IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
2907
2908      // If an IV increment can't be folded, use it as the next IV value.
2909      if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
2910        assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
2911        IVSrc = IVOper;
2912        LeftOverExpr = nullptr;
2913      }
2914    }
2915    Type *OperTy = Inc.IVOperand->getType();
2916    if (IVTy != OperTy) {
2917      assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
2918             "cannot extend a chained IV");
2919      IRBuilder<> Builder(InsertPt);
2920      IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
2921    }
2922    Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
2923    DeadInsts.emplace_back(Inc.IVOperand);
2924  }
2925  // If LSR created a new, wider phi, we may also replace its postinc. We only
2926  // do this if we also found a wide value for the head of the chain.
2927  if (isa<PHINode>(Chain.tailUserInst())) {
2928    for (BasicBlock::iterator I = L->getHeader()->begin();
2929         PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
2930      if (!isCompatibleIVType(Phi, IVSrc))
2931        continue;
2932      Instruction *PostIncV = dyn_cast<Instruction>(
2933        Phi->getIncomingValueForBlock(L->getLoopLatch()));
2934      if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
2935        continue;
2936      Value *IVOper = IVSrc;
2937      Type *PostIncTy = PostIncV->getType();
2938      if (IVTy != PostIncTy) {
2939        assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
2940        IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
2941        Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
2942        IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
2943      }
2944      Phi->replaceUsesOfWith(PostIncV, IVOper);
2945      DeadInsts.emplace_back(PostIncV);
2946    }
2947  }
2948}
2949
2950void LSRInstance::CollectFixupsAndInitialFormulae() {
2951  for (const IVStrideUse &U : IU) {
2952    Instruction *UserInst = U.getUser();
2953    // Skip IV users that are part of profitable IV Chains.
2954    User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
2955                                       U.getOperandValToReplace());
2956    assert(UseI != UserInst->op_end() && "cannot find IV operand");
2957    if (IVIncSet.count(UseI))
2958      continue;
2959
2960    // Record the uses.
2961    LSRFixup &LF = getNewFixup();
2962    LF.UserInst = UserInst;
2963    LF.OperandValToReplace = U.getOperandValToReplace();
2964    LF.PostIncLoops = U.getPostIncLoops();
2965
2966    LSRUse::KindType Kind = LSRUse::Basic;
2967    MemAccessTy AccessTy;
2968    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
2969      Kind = LSRUse::Address;
2970      AccessTy = getAccessType(LF.UserInst);
2971    }
2972
2973    const SCEV *S = IU.getExpr(U);
2974
2975    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2976    // (N - i == 0), and this allows (N - i) to be the expression that we work
2977    // with rather than just N or i, so we can consider the register
2978    // requirements for both N and i at the same time. Limiting this code to
2979    // equality icmps is not a problem because all interesting loops use
2980    // equality icmps, thanks to IndVarSimplify.
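    // For example (illustrative), with %i = {0,+,1} and loop-invariant %n,
    // "icmp eq %i, %n" is handled as the ICmpZero expression {%n,+,-1} == 0,
    // so register choices for %n and %i are considered together.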
2981    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
2982      if (CI->isEquality()) {
2983        // Swap the operands if needed to put the OperandValToReplace on the
2984        // left, for consistency.
2985        Value *NV = CI->getOperand(1);
2986        if (NV == LF.OperandValToReplace) {
2987          CI->setOperand(1, CI->getOperand(0));
2988          CI->setOperand(0, NV);
2989          NV = CI->getOperand(1);
2990          Changed = true;
2991        }
2992
2993        // x == y  -->  x - y == 0
2994        const SCEV *N = SE.getSCEV(NV);
2995        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
2996          // S is normalized, so normalize N before folding it into S
2997          // to keep the result normalized.
2998          N = TransformForPostIncUse(Normalize, N, CI, nullptr,
2999                                     LF.PostIncLoops, SE, DT);
3000          Kind = LSRUse::ICmpZero;
3001          S = SE.getMinusSCEV(N, S);
3002        }
3003
3004        // -1 and the negations of all interesting strides (except the negation
3005        // of -1) are now also interesting.
3006        for (size_t i = 0, e = Factors.size(); i != e; ++i)
3007          if (Factors[i] != -1)
3008            Factors.insert(-(uint64_t)Factors[i]);
3009        Factors.insert(-1);
3010      }
3011
3012    // Set up the initial formula for this use.
3013    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
3014    LF.LUIdx = P.first;
3015    LF.Offset = P.second;
3016    LSRUse &LU = Uses[LF.LUIdx];
3017    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3018    if (!LU.WidestFixupType ||
3019        SE.getTypeSizeInBits(LU.WidestFixupType) <
3020        SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3021      LU.WidestFixupType = LF.OperandValToReplace->getType();
3022
3023    // If this is the first use of this LSRUse, give it a formula.
3024    if (LU.Formulae.empty()) {
3025      InsertInitialFormula(S, LU, LF.LUIdx);
3026      CountRegisters(LU.Formulae.back(), LF.LUIdx);
3027    }
3028  }
3029
3030  DEBUG(print_fixups(dbgs()));
3031}
3032
3033/// Insert a formula for the given expression into the given use, separating out
3034/// loop-variant portions from loop-invariant and loop-computable portions.
3035void
3036LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
3037  // Mark uses whose expressions cannot be expanded.
3038  if (!isSafeToExpand(S, SE))
3039    LU.RigidFormula = true;
3040
3041  Formula F;
3042  F.initialMatch(S, L, SE);
3043  bool Inserted = InsertFormula(LU, LUIdx, F);
3044  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3045}
3046
3047/// Insert a simple single-register formula for the given expression into the
3048/// given use.
3049void
3050LSRInstance::InsertSupplementalFormula(const SCEV *S,
3051                                       LSRUse &LU, size_t LUIdx) {
3052  Formula F;
3053  F.BaseRegs.push_back(S);
3054  F.HasBaseReg = true;
3055  bool Inserted = InsertFormula(LU, LUIdx, F);
3056  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3057}
3058
3059/// Note which registers are used by the given formula, updating RegUses.
3060void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3061  if (F.ScaledReg)
3062    RegUses.countRegister(F.ScaledReg, LUIdx);
3063  for (const SCEV *BaseReg : F.BaseRegs)
3064    RegUses.countRegister(BaseReg, LUIdx);
3065}
3066
3067/// If the given formula has not yet been inserted, add it to the list, and
3068/// return true. Return false otherwise.
3069bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3070  // Do not insert a formula that we will not be able to expand.
3071  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3072         "Formula is illegal");
3073  if (!LU.InsertFormula(F))
3074    return false;
3075
3076  CountRegisters(F, LUIdx);
3077  return true;
3078}
3079
3080/// Check for other uses of loop-invariant values which we're tracking. These
3081/// other uses will pin these values in registers, making them less profitable
3082/// for elimination.
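/// For instance (illustrative), if %n is defined before the loop and is also
/// used by an instruction dominated by the loop header, a Basic use is
/// recorded for that user here, so the solver accounts for %n occupying a
/// register regardless of which formulae it picks.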
3083/// TODO: This currently misses non-constant addrec step registers.
3084/// TODO: Should this give more weight to users inside the loop?
3085void
3086LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3087  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3088  SmallPtrSet<const SCEV *, 32> Visited;
3089
3090  while (!Worklist.empty()) {
3091    const SCEV *S = Worklist.pop_back_val();
3092
3093    // Don't process the same SCEV twice.
3094    if (!Visited.insert(S).second)
3095      continue;
3096
3097    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3098      Worklist.append(N->op_begin(), N->op_end());
3099    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
3100      Worklist.push_back(C->getOperand());
3101    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3102      Worklist.push_back(D->getLHS());
3103      Worklist.push_back(D->getRHS());
3104    } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3105      const Value *V = US->getValue();
3106      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3107        // Look for instructions defined outside the loop.
3108        if (L->contains(Inst)) continue;
3109      } else if (isa<UndefValue>(V))
3110        // Undef doesn't have a live range, so it doesn't matter.
3111        continue;
3112      for (const Use &U : V->uses()) {
3113        const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3114        // Ignore non-instructions.
3115        if (!UserInst)
3116          continue;
3117        // Ignore instructions in other functions (as can happen with
3118        // Constants).
3119        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3120          continue;
3121        // Ignore instructions not dominated by the loop.
3122        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3123          UserInst->getParent() :
3124          cast<PHINode>(UserInst)->getIncomingBlock(
3125            PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3126        if (!DT.dominates(L->getHeader(), UseBB))
3127          continue;
3128        // Don't bother if the instruction is in a BB which ends in an EHPad.
3129        if (UseBB->getTerminator()->isEHPad())
3130          continue;
3131        // Ignore uses which are part of other SCEV expressions, to avoid
3132        // analyzing them multiple times.
3133        if (SE.isSCEVable(UserInst->getType())) {
3134          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3135          // If the user is a no-op, look through to its uses.
3136          if (!isa<SCEVUnknown>(UserS))
3137            continue;
3138          if (UserS == US) {
3139            Worklist.push_back(
3140              SE.getUnknown(const_cast<Instruction *>(UserInst)));
3141            continue;
3142          }
3143        }
3144        // Ignore icmp instructions which are already being analyzed.
3145        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3146          unsigned OtherIdx = !U.getOperandNo();
3147          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3148          if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3149            continue;
3150        }
3151
3152        LSRFixup &LF = getNewFixup();
3153        LF.UserInst = const_cast<Instruction *>(UserInst);
3154        LF.OperandValToReplace = U;
3155        std::pair<size_t, int64_t> P = getUse(
3156            S, LSRUse::Basic, MemAccessTy());
3157        LF.LUIdx = P.first;
3158        LF.Offset = P.second;
3159        LSRUse &LU = Uses[LF.LUIdx];
3160        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3161        if (!LU.WidestFixupType ||
3162            SE.getTypeSizeInBits(LU.WidestFixupType) <
3163            SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3164          LU.WidestFixupType = LF.OperandValToReplace->getType();
3165        InsertSupplementalFormula(US, LU, LF.LUIdx);
3166        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3167        break;
3168      }
3169    }
3170  }
3171}
3172
3173/// Split S into subexpressions which can be pulled out into separate
3174/// registers. If C is non-null, multiply each subexpression by C.
3175///
3176/// Return remainder expression after factoring the subexpressions captured by
3177/// Ops. If Ops is complete, return NULL.
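/// For example (illustrative), given S = {(%a + %b),+,1}<%L> this collects
/// %a and %b into Ops and returns {0,+,1}<%L> as the remainder, letting the
/// caller consider keeping each piece in its own register.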
3178static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3179                                   SmallVectorImpl<const SCEV *> &Ops,
3180                                   const Loop *L,
3181                                   ScalarEvolution &SE,
3182                                   unsigned Depth = 0) {
3183  // Arbitrarily cap recursion to protect compile time.
3184  if (Depth >= 3)
3185    return S;
3186
3187  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3188    // Break out add operands.
3189    for (const SCEV *S : Add->operands()) {
3190      const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3191      if (Remainder)
3192        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3193    }
3194    return nullptr;
3195  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3196    // Split a non-zero base out of an addrec.
3197    if (AR->getStart()->isZero())
3198      return S;
3199
3200    const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3201                                            C, Ops, L, SE, Depth+1);
3202    // Split the non-zero AddRec unless it is part of a nested recurrence that
3203    // does not pertain to this loop.
3204    if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3205      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3206      Remainder = nullptr;
3207    }
3208    if (Remainder != AR->getStart()) {
3209      if (!Remainder)
3210        Remainder = SE.getConstant(AR->getType(), 0);
3211      return SE.getAddRecExpr(Remainder,
3212                              AR->getStepRecurrence(SE),
3213                              AR->getLoop(),
3214                              //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3215                              SCEV::FlagAnyWrap);
3216    }
3217  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3218    // Break (C * (a + b + c)) into C*a + C*b + C*c.
3219    if (Mul->getNumOperands() != 2)
3220      return S;
3221    if (const SCEVConstant *Op0 =
3222        dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3223      C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3224      const SCEV *Remainder =
3225        CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3226      if (Remainder)
3227        Ops.push_back(SE.getMulExpr(C, Remainder));
3228      return nullptr;
3229    }
3230  }
3231  return S;
3232}
3233
3234/// \brief Helper function for LSRInstance::GenerateReassociations.
3235void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3236                                             const Formula &Base,
3237                                             unsigned Depth, size_t Idx,
3238                                             bool IsScaledReg) {
3239  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3240  SmallVector<const SCEV *, 8> AddOps;
3241  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3242  if (Remainder)
3243    AddOps.push_back(Remainder);
3244
3245  if (AddOps.size() == 1)
3246    return;
3247
3248  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3249                                                     JE = AddOps.end();
3250       J != JE; ++J) {
3251
3252    // Loop-variant "unknown" values are uninteresting; we won't be able to
3253    // do anything meaningful with them.
3254    if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3255      continue;
3256
3257    // Don't pull a constant into a register if the constant could be folded
3258    // into an immediate field.
3259    if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3260                         LU.AccessTy, *J, Base.getNumRegs() > 1))
3261      continue;
3262
3263    // Collect all operands except *J.
3264    SmallVector<const SCEV *, 8> InnerAddOps(
3265        ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3266    InnerAddOps.append(std::next(J),
3267                       ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3268
3269    // Don't leave just a constant behind in a register if the constant could
3270    // be folded into an immediate field.
3271    if (InnerAddOps.size() == 1 &&
3272        isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3273                         LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3274      continue;
3275
3276    const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3277    if (InnerSum->isZero())
3278      continue;
3279    Formula F = Base;
3280
3281    // Add the remaining pieces of the add back into the new formula.
3282    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3283    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3284        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3285                                InnerSumSC->getValue()->getZExtValue())) {
3286      F.UnfoldedOffset =
3287          (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
3288      if (IsScaledReg)
3289        F.ScaledReg = nullptr;
3290      else
3291        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3292    } else if (IsScaledReg)
3293      F.ScaledReg = InnerSum;
3294    else
3295      F.BaseRegs[Idx] = InnerSum;
3296
3297    // Add J as its own register, or an unfolded immediate.
3298    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3299    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3300        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3301                                SC->getValue()->getZExtValue()))
3302      F.UnfoldedOffset =
3303          (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
3304    else
3305      F.BaseRegs.push_back(*J);
3306    // We may have changed the number of registers in the base regs; adjust the
3307    // formula accordingly.
3308    F.canonicalize();
3309
3310    if (InsertFormula(LU, LUIdx, F))
3311      // If that formula hadn't been seen before, recurse to find more like
3312      // it.
3313      GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
3314  }
3315}
3316
3317/// Split out subexpressions from adds and the bases of addrecs.
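/// For example (illustrative), a formula with the single base register
/// {(%a + %b),+,1}<%L> can be reassociated into reg(%a) + reg({%b,+,1}<%L>),
/// exposing %a for reuse by other uses that already need it in a register.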
3318void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3319                                         Formula Base, unsigned Depth) {
3320  assert(Base.isCanonical() && "Input must be in the canonical form");
3321  // Arbitrarily cap recursion to protect compile time.
3322  if (Depth >= 3)
3323    return;
3324
3325  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3326    GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
3327
3328  if (Base.Scale == 1)
3329    GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3330                               /* Idx */ -1, /* IsScaledReg */ true);
3331}
3332
3333/// Generate a formula consisting of all of the loop-dominating registers added
3334/// into a single register.
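/// For instance (illustrative), reg(%a) + reg(%b) + reg({0,+,1}<%L>) with
/// loop-invariant %a and %b that dominate the header can be combined into
/// reg(%a + %b) + reg({0,+,1}<%L>), so the invariant part needs only one
/// register.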
3335void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3336                                       Formula Base) {
3337  // This method is only interesting on a plurality of registers.
3338  if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
3339    return;
3340
3341  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3342  // processing the formula.
3343  Base.unscale();
3344  Formula F = Base;
3345  F.BaseRegs.clear();
3346  SmallVector<const SCEV *, 4> Ops;
3347  for (const SCEV *BaseReg : Base.BaseRegs) {
3348    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3349        !SE.hasComputableLoopEvolution(BaseReg, L))
3350      Ops.push_back(BaseReg);
3351    else
3352      F.BaseRegs.push_back(BaseReg);
3353  }
3354  if (Ops.size() > 1) {
3355    const SCEV *Sum = SE.getAddExpr(Ops);
3356    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3357    // opportunity to fold something. For now, just ignore such cases
3358    // rather than proceed with zero in a register.
3359    if (!Sum->isZero()) {
3360      F.BaseRegs.push_back(Sum);
3361      F.canonicalize();
3362      (void)InsertFormula(LU, LUIdx, F);
3363    }
3364  }
3365}
3366
3367/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
3368void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
3369                                              const Formula &Base, size_t Idx,
3370                                              bool IsScaledReg) {
3371  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3372  GlobalValue *GV = ExtractSymbol(G, SE);
3373  if (G->isZero() || !GV)
3374    return;
3375  Formula F = Base;
3376  F.BaseGV = GV;
3377  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3378    return;
3379  if (IsScaledReg)
3380    F.ScaledReg = G;
3381  else
3382    F.BaseRegs[Idx] = G;
3383  (void)InsertFormula(LU, LUIdx, F);
3384}
3385
3386/// Generate reuse formulae using symbolic offsets.
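/// For example (illustrative), a base register computing @global + %x can be
/// split so that @global becomes the formula's BaseGV and reg(%x) remains,
/// provided the target's addressing modes accept a symbolic base.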
3387void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3388                                          Formula Base) {
3389  // We can't add a symbolic offset if the address already contains one.
3390  if (Base.BaseGV) return;
3391
3392  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3393    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
3394  if (Base.Scale == 1)
3395    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
3396                                /* IsScaledReg */ true);
3397}
3398
3399/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
3400void LSRInstance::GenerateConstantOffsetsImpl(
3401    LSRUse &LU, unsigned LUIdx, const Formula &Base,
3402    const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
3403  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3404  for (int64_t Offset : Worklist) {
3405    Formula F = Base;
3406    F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
3407    if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
3408                   LU.AccessTy, F)) {
3409      // Add the offset to the base register.
3410      const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
3411      // If it cancelled out, drop the base register, otherwise update it.
3412      if (NewG->isZero()) {
3413        if (IsScaledReg) {
3414          F.Scale = 0;
3415          F.ScaledReg = nullptr;
3416        } else
3417          F.deleteBaseReg(F.BaseRegs[Idx]);
3418        F.canonicalize();
3419      } else if (IsScaledReg)
3420        F.ScaledReg = NewG;
3421      else
3422        F.BaseRegs[Idx] = NewG;
3423
3424      (void)InsertFormula(LU, LUIdx, F);
3425    }
3426  }
3427
3428  int64_t Imm = ExtractImmediate(G, SE);
3429  if (G->isZero() || Imm == 0)
3430    return;
3431  Formula F = Base;
3432  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3433  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3434    return;
3435  if (IsScaledReg)
3436    F.ScaledReg = G;
3437  else
3438    F.BaseRegs[Idx] = G;
3439  (void)InsertFormula(LU, LUIdx, F);
3440}
3441
3442/// Generate reuse formulae using constant offsets.
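/// E.g. (illustrative), a base register {(%a + 4),+,1}<%L> can be rewritten
/// as BaseOffset 4 plus reg({%a,+,1}<%L>); likewise the use's min/max fixup
/// offsets can be shifted into the register so that several fixups share the
/// same base.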
3443void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3444                                          Formula Base) {
3445  // TODO: For now, just add the min and max offset, because it usually isn't
3446  // worthwhile looking at everything in between.
3447  SmallVector<int64_t, 2> Worklist;
3448  Worklist.push_back(LU.MinOffset);
3449  if (LU.MaxOffset != LU.MinOffset)
3450    Worklist.push_back(LU.MaxOffset);
3451
3452  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3453    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
3454  if (Base.Scale == 1)
3455    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
3456                                /* IsScaledReg */ true);
3457}
3458
3459/// For ICmpZero, check to see if we can scale up the comparison. For example,
3460/// x == y -> x*c == y*c.
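/// E.g., if the loop has an interesting stride of 4, "i == n" can be
/// rewritten as "i*4 == n*4", so the comparison can be expressed in terms of
/// a register that already strides by 4 rather than requiring a separate
/// unit-stride register (illustrative).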
3461void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3462                                         Formula Base) {
3463  if (LU.Kind != LSRUse::ICmpZero) return;
3464
3465  // Determine the integer type for the base formula.
3466  Type *IntTy = Base.getType();
3467  if (!IntTy) return;
3468  if (SE.getTypeSizeInBits(IntTy) > 64) return;
3469
3470  // Don't do this if there is more than one offset.
3471  if (LU.MinOffset != LU.MaxOffset) return;
3472
3473  assert(!Base.BaseGV && "ICmpZero use is not legal!");
3474
3475  // Check each interesting stride.
3476  for (int64_t Factor : Factors) {
3477    // Check that the multiplication doesn't overflow.
3478    if (Base.BaseOffset == INT64_MIN && Factor == -1)
3479      continue;
3480    int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
3481    if (NewBaseOffset / Factor != Base.BaseOffset)
3482      continue;
3483    // If the offset will be truncated at this use, check that it is in bounds.
3484    if (!IntTy->isPointerTy() &&
3485        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
3486      continue;
3487
3488    // Check that multiplying with the use offset doesn't overflow.
3489    int64_t Offset = LU.MinOffset;
3490    if (Offset == INT64_MIN && Factor == -1)
3491      continue;
3492    Offset = (uint64_t)Offset * Factor;
3493    if (Offset / Factor != LU.MinOffset)
3494      continue;
3495    // If the offset will be truncated at this use, check that it is in bounds.
3496    if (!IntTy->isPointerTy() &&
3497        !ConstantInt::isValueValidForType(IntTy, Offset))
3498      continue;
3499
3500    Formula F = Base;
3501    F.BaseOffset = NewBaseOffset;
3502
3503    // Check that this scale is legal.
3504    if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
3505      continue;
3506
3507    // Compensate for the use having MinOffset built into it.
3508    F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
3509
3510    const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3511
3512    // Check that multiplying with each base register doesn't overflow.
3513    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
3514      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
3515      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
3516        goto next;
3517    }
3518
3519    // Check that multiplying with the scaled register doesn't overflow.
3520    if (F.ScaledReg) {
3521      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
3522      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
3523        continue;
3524    }
3525
3526    // Check that multiplying with the unfolded offset doesn't overflow.
3527    if (F.UnfoldedOffset != 0) {
3528      if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
3529        continue;
3530      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
3531      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
3532        continue;
3533      // If the offset will be truncated, check that it is in bounds.
3534      if (!IntTy->isPointerTy() &&
3535          !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
3536        continue;
3537    }
3538
3539    // If we make it here and it's legal, add it.
3540    (void)InsertFormula(LU, LUIdx, F);
3541  next:;
3542  }
3543}
3544
3545/// Generate stride factor reuse formulae by making use of scaled-offset address
3546/// modes, for example.
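/// E.g. (illustrative), the base register {0,+,4}<%L> can be rewritten as the
/// scaled register {0,+,1}<%L> with Scale = 4, which an address use can
/// consume directly if the target supports a *4 scale.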
3547void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
3548  // Determine the integer type for the base formula.
3549  Type *IntTy = Base.getType();
3550  if (!IntTy) return;
3551
3552  // If this Formula already has a scaled register, we can't add another one.
3553  // Try to unscale the formula to generate a better scale.
3554  if (Base.Scale != 0 && !Base.unscale())
3555    return;
3556
3557  assert(Base.Scale == 0 && "unscale did not do its job!");
3558
3559  // Check each interesting stride.
3560  for (int64_t Factor : Factors) {
3561    Base.Scale = Factor;
3562    Base.HasBaseReg = Base.BaseRegs.size() > 1;
3563    // Check whether this scale is going to be legal.
3564    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3565                    Base)) {
3566      // As a special case, handle out-of-loop Basic users via the Special kind.
3567      // TODO: Reconsider this special case.
3568      if (LU.Kind == LSRUse::Basic &&
3569          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
3570                     LU.AccessTy, Base) &&
3571          LU.AllFixupsOutsideLoop)
3572        LU.Kind = LSRUse::Special;
3573      else
3574        continue;
3575    }
3576    // For an ICmpZero, negating a solitary base register won't lead to
3577    // new solutions.
3578    if (LU.Kind == LSRUse::ICmpZero &&
3579        !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
3580      continue;
3581    // For each addrec base reg, apply the scale, if possible.
3582    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3583      if (const SCEVAddRecExpr *AR =
3584            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
3585        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3586        if (FactorS->isZero())
3587          continue;
3588        // Divide out the factor, ignoring high bits, since we'll be
3589        // scaling the value back up in the end.
3590        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3591          // TODO: This could be optimized to avoid all the copying.
3592          Formula F = Base;
3593          F.ScaledReg = Quotient;
3594          F.deleteBaseReg(F.BaseRegs[i]);
3595          // The canonical representation of 1*reg is reg, which is already in
3596          // Base. In that case, do not try to insert the formula; it will be
3597          // rejected anyway.
3598          if (F.Scale == 1 && F.BaseRegs.empty())
3599            continue;
3600          (void)InsertFormula(LU, LUIdx, F);
3601        }
3602      }
3603  }
3604}
3605
3606/// Generate reuse formulae from different IV types.
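/// For example (illustrative), when truncation from i64 to i32 is free on the
/// target, a formula for an i32 use can be rewritten over any-extended i64
/// registers, letting it share a wider induction variable that other uses
/// already require.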
3607void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
3608  // Don't bother truncating symbolic values.
3609  if (Base.BaseGV) return;
3610
3611  // Determine the integer type for the base formula.
3612  Type *DstTy = Base.getType();
3613  if (!DstTy) return;
3614  DstTy = SE.getEffectiveSCEVType(DstTy);
3615
3616  for (Type *SrcTy : Types) {
3617    if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
3618      Formula F = Base;
3619
3620      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
3621      for (const SCEV *&BaseReg : F.BaseRegs)
3622        BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
3623
3624      // TODO: This assumes we've done basic processing on all uses and
3625      // have an idea what the register usage is.
3626      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
3627        continue;
3628
3629      (void)InsertFormula(LU, LUIdx, F);
3630    }
3631  }
3632}
3633
3634namespace {
3635
3636/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
3637/// modifications so that the search phase doesn't have to worry about the data
3638/// structures moving underneath it.
3639struct WorkItem {
3640  size_t LUIdx;
3641  int64_t Imm;
3642  const SCEV *OrigReg;
3643
3644  WorkItem(size_t LI, int64_t I, const SCEV *R)
3645    : LUIdx(LI), Imm(I), OrigReg(R) {}
3646
3647  void print(raw_ostream &OS) const;
3648  void dump() const;
3649};
3650
3651}
3652
3653void WorkItem::print(raw_ostream &OS) const {
3654  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
3655     << ", add offset " << Imm;
3656}
3657
3658LLVM_DUMP_METHOD
3659void WorkItem::dump() const {
3660  print(errs()); errs() << '\n';
3661}
3662
3663/// Look for registers which are a constant distance apart and try to form reuse
3664/// opportunities between them.
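/// For instance (illustrative), if one use has a formula referencing reg(%a)
/// and another references reg(%a + 4), the second can often be rewritten to
/// use reg(%a) with a BaseOffset of 4, so both uses share a single register.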
3665void LSRInstance::GenerateCrossUseConstantOffsets() {
3666  // Group the registers by their value without any added constant offset.
3667  typedef std::map<int64_t, const SCEV *> ImmMapTy;
3668  DenseMap<const SCEV *, ImmMapTy> Map;
3669  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
3670  SmallVector<const SCEV *, 8> Sequence;
3671  for (const SCEV *Use : RegUses) {
3672    const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
3673    int64_t Imm = ExtractImmediate(Reg, SE);
3674    auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
3675    if (Pair.second)
3676      Sequence.push_back(Reg);
3677    Pair.first->second.insert(std::make_pair(Imm, Use));
3678    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
3679  }
3680
3681  // Now examine each set of registers with the same base value. Build up
3682  // a list of work to do and do the work in a separate step so that we're
3683  // not adding formulae and register counts while we're searching.
3684  SmallVector<WorkItem, 32> WorkItems;
3685  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
3686  for (const SCEV *Reg : Sequence) {
3687    const ImmMapTy &Imms = Map.find(Reg)->second;
3688
3689    // It's not worthwhile looking for reuse if there's only one offset.
3690    if (Imms.size() == 1)
3691      continue;
3692
3693    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
3694          for (const auto &Entry : Imms)
3695            dbgs() << ' ' << Entry.first;
3696          dbgs() << '\n');
3697
3698    // Examine each offset.
3699    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3700         J != JE; ++J) {
3701      const SCEV *OrigReg = J->second;
3702
3703      int64_t JImm = J->first;
3704      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
3705
3706      if (!isa<SCEVConstant>(OrigReg) &&
3707          UsedByIndicesMap[Reg].count() == 1) {
3708        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
3709        continue;
3710      }
3711
3712      // Conservatively examine offsets between this orig reg and a few
3713      // selected other orig regs.
3714      ImmMapTy::const_iterator OtherImms[] = {
3715        Imms.begin(), std::prev(Imms.end()),
3716        Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
3717                         2)
3718      };
3719      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
3720        ImmMapTy::const_iterator M = OtherImms[i];
3721        if (M == J || M == JE) continue;
3722
3723        // Compute the difference between the two.
3724        int64_t Imm = (uint64_t)JImm - M->first;
3725        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
3726             LUIdx = UsedByIndices.find_next(LUIdx))
3727          // Make a memo of this use, offset, and register tuple.
3728          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
3729            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
3730      }
3731    }
3732  }
3733
3734  Map.clear();
3735  Sequence.clear();
3736  UsedByIndicesMap.clear();
3737  UniqueItems.clear();
3738
3739  // Now iterate through the worklist and add new formulae.
3740  for (const WorkItem &WI : WorkItems) {
3741    size_t LUIdx = WI.LUIdx;
3742    LSRUse &LU = Uses[LUIdx];
3743    int64_t Imm = WI.Imm;
3744    const SCEV *OrigReg = WI.OrigReg;
3745
3746    Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
3747    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
3748    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
3749
3750    // TODO: Use a more targeted data structure.
3751    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
3752      Formula F = LU.Formulae[L];
3753      // FIXME: The code for the scaled and unscaled registers looks
3754      // very similar but slightly different. Investigate if they
3755      // could be merged. That way, we would not have to unscale the
3756      // Formula.
3757      F.unscale();
3758      // Use the immediate in the scaled register.
3759      if (F.ScaledReg == OrigReg) {
3760        int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
3761        // Don't create 50 + reg(-50).
3762        if (F.referencesReg(SE.getSCEV(
3763                   ConstantInt::get(IntTy, -(uint64_t)Offset))))
3764          continue;
3765        Formula NewF = F;
3766        NewF.BaseOffset = Offset;
3767        if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3768                        NewF))
3769          continue;
3770        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
3771
3772        // If the new scale is a constant in a register, and adding the constant
3773        // value to the immediate would produce a value closer to zero than the
3774        // immediate itself, then the formula isn't worthwhile.
3775        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
3776          if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
3777              (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
3778                  .ule(std::abs(NewF.BaseOffset)))
3779            continue;
3780
3781        // OK, looks good.
3782        NewF.canonicalize();
3783        (void)InsertFormula(LU, LUIdx, NewF);
3784      } else {
3785        // Use the immediate in a base register.
3786        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
3787          const SCEV *BaseReg = F.BaseRegs[N];
3788          if (BaseReg != OrigReg)
3789            continue;
3790          Formula NewF = F;
3791          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
3792          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
3793                          LU.Kind, LU.AccessTy, NewF)) {
3794            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
3795              continue;
3796            NewF = F;
3797            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
3798          }
3799          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
3800
3801          // If the new formula has a constant in a register, and adding the
3802          // constant value to the immediate would produce a value closer to
3803          // zero than the immediate itself, then the formula isn't worthwhile.
3804          for (const SCEV *NewReg : NewF.BaseRegs)
3805            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
3806              if ((C->getAPInt() + NewF.BaseOffset)
3807                      .abs()
3808                      .slt(std::abs(NewF.BaseOffset)) &&
3809                  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
3810                      countTrailingZeros<uint64_t>(NewF.BaseOffset))
3811                goto skip_formula;
3812
3813          // Ok, looks good.
3814          NewF.canonicalize();
3815          (void)InsertFormula(LU, LUIdx, NewF);
3816          break;
3817        skip_formula:;
3818        }
3819      }
3820    }
3821  }
3822}
3823
3824/// Generate formulae for each use.
3825void
3826LSRInstance::GenerateAllReuseFormulae() {
3827  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
3828  // queries are more precise.
3829  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3830    LSRUse &LU = Uses[LUIdx];
3831    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3832      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
3833    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3834      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
3835  }
3836  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3837    LSRUse &LU = Uses[LUIdx];
3838    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3839      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
3840    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3841      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
3842    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3843      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
3844    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3845      GenerateScales(LU, LUIdx, LU.Formulae[i]);
3846  }
3847  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3848    LSRUse &LU = Uses[LUIdx];
3849    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3850      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
3851  }
3852
3853  GenerateCrossUseConstantOffsets();
3854
3855  DEBUG(dbgs() << "\n"
3856                  "After generating reuse formulae:\n";
3857        print_uses(dbgs()));
3858}
3859
3860/// If there are multiple formulae with the same set of registers used
3861/// by other uses, pick the best one and delete the others.
3862void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
3863  DenseSet<const SCEV *> VisitedRegs;
3864  SmallPtrSet<const SCEV *, 16> Regs;
3865  SmallPtrSet<const SCEV *, 16> LoserRegs;
3866#ifndef NDEBUG
3867  bool ChangedFormulae = false;
3868#endif
3869
3870  // Collect the best formula for each unique set of shared registers. This
3871  // is reset for each use.
3872  typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
3873    BestFormulaeTy;
3874  BestFormulaeTy BestFormulae;
3875
3876  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3877    LSRUse &LU = Uses[LUIdx];
3878    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
3879
3880    bool Any = false;
3881    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
3882         FIdx != NumForms; ++FIdx) {
3883      Formula &F = LU.Formulae[FIdx];
3884
3885      // Some formulas are instant losers. For example, they may depend on
3886      // nonexistent AddRecs from other loops. These need to be filtered
3887      // immediately, otherwise heuristics could choose them over others leading
3888      // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
3889      // avoids the need to recompute this information across formulae using the
3890      // same bad AddRec. Passing LoserRegs is also essential unless we remove
3891      // the corresponding bad register from the Regs set.
3892      Cost CostF;
3893      Regs.clear();
3894      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
3895                        &LoserRegs);
3896      if (CostF.isLoser()) {
3897        // During initial formula generation, undesirable formulae are generated
3898        // by uses within other loops that have some non-trivial address mode or
3899        // use the postinc form of the IV. LSR needs to provide these formulae
3900        // as the basis of rediscovering the desired formula that uses an AddRec
3901        // corresponding to the existing phi. Once all formulae have been
3902        // generated, these initial losers may be pruned.
3903        DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
3904              dbgs() << "\n");
3905      }
3906      else {
3907        SmallVector<const SCEV *, 4> Key;
3908        for (const SCEV *Reg : F.BaseRegs) {
3909          if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
3910            Key.push_back(Reg);
3911        }
3912        if (F.ScaledReg &&
3913            RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
3914          Key.push_back(F.ScaledReg);
3915        // Unstable sort by host order ok, because this is only used for
3916        // uniquifying.
3917        std::sort(Key.begin(), Key.end());
3918
3919        std::pair<BestFormulaeTy::const_iterator, bool> P =
3920          BestFormulae.insert(std::make_pair(Key, FIdx));
3921        if (P.second)
3922          continue;
3923
3924        Formula &Best = LU.Formulae[P.first->second];
3925
3926        Cost CostBest;
3927        Regs.clear();
3928        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
3929                             DT, LU);
3930        if (CostF < CostBest)
3931          std::swap(F, Best);
3932        DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
3933              dbgs() << "\n"
3934                        "    in favor of formula "; Best.print(dbgs());
3935              dbgs() << '\n');
3936      }
3937#ifndef NDEBUG
3938      ChangedFormulae = true;
3939#endif
3940      LU.DeleteFormula(F);
3941      --FIdx;
3942      --NumForms;
3943      Any = true;
3944    }
3945
3946    // Now that we've filtered out some formulae, recompute the Regs set.
3947    if (Any)
3948      LU.RecomputeRegs(LUIdx, RegUses);
3949
3950    // Reset this to prepare for the next use.
3951    BestFormulae.clear();
3952  }
3953
3954  DEBUG(if (ChangedFormulae) {
3955          dbgs() << "\n"
3956                    "After filtering out undesirable candidates:\n";
3957          print_uses(dbgs());
3958        });
3959}
3960
3961// This is a rough guess that seems to work fairly well.
3962static const size_t ComplexityLimit = UINT16_MAX;
3963
3964/// Estimate the worst-case number of solutions the solver might have to
3965/// consider. It almost never considers this many solutions because it prunes
3966/// the search space, but the pruning isn't always sufficient.
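/// For example (illustrative), four uses with ten formulae each give an
/// estimate of 10*10*10*10 = 10000 candidate solutions, still well under
/// ComplexityLimit.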
3967size_t LSRInstance::EstimateSearchSpaceComplexity() const {
3968  size_t Power = 1;
3969  for (const LSRUse &LU : Uses) {
3970    size_t FSize = LU.Formulae.size();
3971    if (FSize >= ComplexityLimit) {
3972      Power = ComplexityLimit;
3973      break;
3974    }
3975    Power *= FSize;
3976    if (Power >= ComplexityLimit)
3977      break;
3978  }
3979  return Power;
3980}
3981
3982/// When one formula uses a superset of the registers of another formula, it
3983/// won't help reduce register pressure (though it may not necessarily hurt
3984/// register pressure); remove it to simplify the system.
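/// E.g. (illustrative), if a use has both reg(%a) + reg(4) and a formula that
/// keeps only reg(%a) with the 4 folded into its immediate offset, the former
/// uses a superset of the latter's registers and is deleted.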
3985void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
3986  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3987    DEBUG(dbgs() << "The search space is too complex.\n");
3988
3989    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
3990                    "which use a superset of registers used by other "
3991                    "formulae.\n");
3992
3993    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3994      LSRUse &LU = Uses[LUIdx];
3995      bool Any = false;
3996      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3997        Formula &F = LU.Formulae[i];
3998        // Look for a formula with a constant or GV in a register. If the use
3999        // also has a formula with that same value in an immediate field,
4000        // delete the one that uses a register.
4001        for (SmallVectorImpl<const SCEV *>::const_iterator
4002             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4003          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4004            Formula NewF = F;
4005            NewF.BaseOffset += C->getValue()->getSExtValue();
4006            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4007                                (I - F.BaseRegs.begin()));
4008            if (LU.HasFormulaWithSameRegs(NewF)) {
4009              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4010              LU.DeleteFormula(F);
4011              --i;
4012              --e;
4013              Any = true;
4014              break;
4015            }
4016          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4017            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4018              if (!F.BaseGV) {
4019                Formula NewF = F;
4020                NewF.BaseGV = GV;
4021                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4022                                    (I - F.BaseRegs.begin()));
4023                if (LU.HasFormulaWithSameRegs(NewF)) {
4024                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4025                        dbgs() << '\n');
4026                  LU.DeleteFormula(F);
4027                  --i;
4028                  --e;
4029                  Any = true;
4030                  break;
4031                }
4032              }
4033          }
4034        }
4035      }
4036      if (Any)
4037        LU.RecomputeRegs(LUIdx, RegUses);
4038    }
4039
4040    DEBUG(dbgs() << "After pre-selection:\n";
4041          print_uses(dbgs()));
4042  }
4043}
4044
4045/// When there are many registers for expressions like A, A+1, A+2, etc.,
4046/// allocate a single register for them.
4047void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4048  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4049    return;
4050
4051  DEBUG(dbgs() << "The search space is too complex.\n"
4052                  "Narrowing the search space by assuming that uses separated "
4053                  "by a constant offset will use the same registers.\n");
4054
4055  // This is especially useful for unrolled loops.
4056
4057  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4058    LSRUse &LU = Uses[LUIdx];
4059    for (const Formula &F : LU.Formulae) {
4060      if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
4061        continue;
4062
4063      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4064      if (!LUThatHas)
4065        continue;
4066
4067      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4068                              LU.Kind, LU.AccessTy))
4069        continue;
4070
4071      DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4072
4073      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4074
4075      // Update the relocs to reference the new use.
4076      for (LSRFixup &Fixup : Fixups) {
4077        if (Fixup.LUIdx == LUIdx) {
4078          Fixup.LUIdx = LUThatHas - &Uses.front();
4079          Fixup.Offset += F.BaseOffset;
4080          // Add the new offset to LUThatHas' offset list.
4081          if (LUThatHas->Offsets.back() != Fixup.Offset) {
4082            LUThatHas->Offsets.push_back(Fixup.Offset);
4083            if (Fixup.Offset > LUThatHas->MaxOffset)
4084              LUThatHas->MaxOffset = Fixup.Offset;
4085            if (Fixup.Offset < LUThatHas->MinOffset)
4086              LUThatHas->MinOffset = Fixup.Offset;
4087          }
4088          DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4089        }
4090        if (Fixup.LUIdx == NumUses-1)
4091          Fixup.LUIdx = LUIdx;
4092      }
4093
4094      // Delete formulae from the new use which are no longer legal.
4095      bool Any = false;
4096      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4097        Formula &F = LUThatHas->Formulae[i];
4098        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4099                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4100          DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4101                dbgs() << '\n');
4102          LUThatHas->DeleteFormula(F);
4103          --i;
4104          --e;
4105          Any = true;
4106        }
4107      }
4108
4109      if (Any)
4110        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4111
4112      // Delete the old use.
4113      DeleteUse(LU, LUIdx);
4114      --LUIdx;
4115      --NumUses;
4116      break;
4117    }
4118  }
4119
4120  DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4121}
4122
4123/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4124/// we've done more filtering, as it may be able to find more formulae to
4125/// eliminate.
4126void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4127  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4128    DEBUG(dbgs() << "The search space is too complex.\n");
4129
4130    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4131                    "undesirable dedicated registers.\n");
4132
4133    FilterOutUndesirableDedicatedRegisters();
4134
4135    DEBUG(dbgs() << "After pre-selection:\n";
4136          print_uses(dbgs()));
4137  }
4138}
4139
4140/// Pick a register which seems likely to be profitable, and then in any use
4141/// which has any reference to that register, delete all formulae which do not
4142/// reference that register.
4143void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
4144  // With all other options exhausted, loop until the system is simple
4145  // enough to handle.
4146  SmallPtrSet<const SCEV *, 4> Taken;
4147  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4148    // Ok, we have too many formulae on our hands to conveniently handle.
4149    // Use a rough heuristic to thin out the list.
4150    DEBUG(dbgs() << "The search space is too complex.\n");
4151
4152    // Pick the register which is used by the most LSRUses, which is likely
4153    // to be a good reuse register candidate.
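    // For example, if one stride register appears in five of seven uses and no
    // other register is shared more widely, it is chosen as the winner; those
    // five uses then drop every formula that does not mention it, which can
    // shrink the search space dramatically.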
4154    const SCEV *Best = nullptr;
4155    unsigned BestNum = 0;
4156    for (const SCEV *Reg : RegUses) {
4157      if (Taken.count(Reg))
4158        continue;
4159      if (!Best)
4160        Best = Reg;
4161      else {
4162        unsigned Count = RegUses.getUsedByIndices(Reg).count();
4163        if (Count > BestNum) {
4164          Best = Reg;
4165          BestNum = Count;
4166        }
4167      }
4168    }
4169
4170    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
4171                 << " will yield profitable reuse.\n");
4172    Taken.insert(Best);
4173
4174    // In any use with formulae which reference this register, delete formulae
4175    // which don't reference it.
4176    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4177      LSRUse &LU = Uses[LUIdx];
4178      if (!LU.Regs.count(Best)) continue;
4179
4180      bool Any = false;
4181      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4182        Formula &F = LU.Formulae[i];
4183        if (!F.referencesReg(Best)) {
4184          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4185          LU.DeleteFormula(F);
4186          --e;
4187          --i;
4188          Any = true;
4189          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4190          continue;
4191        }
4192      }
4193
4194      if (Any)
4195        LU.RecomputeRegs(LUIdx, RegUses);
4196    }
4197
4198    DEBUG(dbgs() << "After pre-selection:\n";
4199          print_uses(dbgs()));
4200  }
4201}
4202
4203/// If there are an extraordinary number of formulae to choose from, use some
4204/// rough heuristics to prune down the number of formulae. This keeps the main
4205/// solver from taking an extraordinary amount of time in some worst-case
4206/// scenarios.
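/// Each heuristic below only fires while the estimated search space is still
/// above ComplexityLimit, with winner-register picking kept as the last resort
/// since it is the most aggressive.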
4207void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
4208  NarrowSearchSpaceByDetectingSupersets();
4209  NarrowSearchSpaceByCollapsingUnrolledCode();
4210  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
4211  NarrowSearchSpaceByPickingWinnerRegs();
4212}
4213
4214/// This is the recursive solver.
4215void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
4216                               Cost &SolutionCost,
4217                               SmallVectorImpl<const Formula *> &Workspace,
4218                               const Cost &CurCost,
4219                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
4220                               DenseSet<const SCEV *> &VisitedRegs) const {
4221  // Some ideas:
4222  //  - prune more:
4223  //    - use more aggressive filtering
4224  //    - sort the formulae so that the most profitable solutions are found first
4225  //    - sort the uses too
4226  //  - search faster:
4227  //    - don't compute a cost and then compare; compare while computing a cost
4228  //      and bail early.
4229  //    - track register sets with SmallBitVector
4230
4231  const LSRUse &LU = Uses[Workspace.size()];
4232
4233  // If this use references any register that's already a part of the
4234  // in-progress solution, consider it a requirement that a formula must
4235  // reference that register in order to be considered. This prunes out
4236  // unprofitable searching.
4237  SmallSetVector<const SCEV *, 4> ReqRegs;
4238  for (const SCEV *S : CurRegs)
4239    if (LU.Regs.count(S))
4240      ReqRegs.insert(S);
4241
4242  SmallPtrSet<const SCEV *, 16> NewRegs;
4243  Cost NewCost;
4244  for (const Formula &F : LU.Formulae) {
4245    // Ignore formulae which may not be ideal in terms of register reuse of
4246    // ReqRegs.  The formula should use all required registers before
4247    // introducing new ones.
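    // For example, if ReqRegs is {A, B} and this formula uses {A, C}, only A
    // is found, NumReqRegsToFind stays nonzero, and the formula is skipped in
    // favor of formulae that reuse both required registers.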
4248    int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
4249    for (const SCEV *Reg : ReqRegs) {
4250      if ((F.ScaledReg && F.ScaledReg == Reg) ||
4251          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
4252          F.BaseRegs.end()) {
4253        --NumReqRegsToFind;
4254        if (NumReqRegsToFind == 0)
4255          break;
4256      }
4257    }
4258    if (NumReqRegsToFind != 0) {
4259      // If none of the formulae satisfied the required registers, then we could
4260      // clear ReqRegs and try again. Currently, we simply give up in this case.
4261      continue;
4262    }
4263
4264    // Evaluate the cost of the current formula. If it's already worse than
4265    // the current best, prune the search at that point.
4266    NewCost = CurCost;
4267    NewRegs = CurRegs;
4268    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
4269                        LU);
4270    if (NewCost < SolutionCost) {
4271      Workspace.push_back(&F);
4272      if (Workspace.size() != Uses.size()) {
4273        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
4274                     NewRegs, VisitedRegs);
4275        if (F.getNumRegs() == 1 && Workspace.size() == 1)
4276          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
4277      } else {
4278        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
4279              dbgs() << ".\n Regs:";
4280              for (const SCEV *S : NewRegs)
4281                dbgs() << ' ' << *S;
4282              dbgs() << '\n');
4283
4284        SolutionCost = NewCost;
4285        Solution = Workspace;
4286      }
4287      Workspace.pop_back();
4288    }
4289  }
4290}
4291
4292/// Choose one formula from each use. Return the results in the given Solution
4293/// vector.
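/// This is a depth-first branch-and-bound search: SolutionCost starts out
/// "lost" (worse than any real cost), so the first complete assignment found
/// by SolveRecurse becomes the initial best, and later branches are pruned
/// against whatever the current best happens to be.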
4294void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
4295  SmallVector<const Formula *, 8> Workspace;
4296  Cost SolutionCost;
4297  SolutionCost.Lose();
4298  Cost CurCost;
4299  SmallPtrSet<const SCEV *, 16> CurRegs;
4300  DenseSet<const SCEV *> VisitedRegs;
4301  Workspace.reserve(Uses.size());
4302
4303  // SolveRecurse does all the work.
4304  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
4305               CurRegs, VisitedRegs);
4306  if (Solution.empty()) {
4307    DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
4308    return;
4309  }
4310
4311  // Ok, we've now made all our decisions.
4312  DEBUG(dbgs() << "\n"
4313                  "The chosen solution requires "; SolutionCost.print(dbgs());
4314        dbgs() << ":\n";
4315        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
4316          dbgs() << "  ";
4317          Uses[i].print(dbgs());
4318          dbgs() << "\n"
4319                    "    ";
4320          Solution[i]->print(dbgs());
4321          dbgs() << '\n';
4322        });
4323
4324  assert(Solution.size() == Uses.size() && "Malformed solution!");
4325}
4326
4327/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
4328/// as we can go while still being dominated by the input positions. This helps
4329/// canonicalize the insert position, which encourages sharing.
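/// For example, if two uses in sibling blocks expand the same expression and
/// both insert positions hoist to their common dominator, SCEVExpander sees a
/// single canonical insert point and can emit the computation once rather
/// than twice.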
4330BasicBlock::iterator
4331LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
4332                                 const SmallVectorImpl<Instruction *> &Inputs)
4333                                                                         const {
4334  for (;;) {
4335    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
4336    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
4337
4338    BasicBlock *IDom;
4339    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
4340      if (!Rung) return IP;
4341      Rung = Rung->getIDom();
4342      if (!Rung) return IP;
4343      IDom = Rung->getBlock();
4344
4345      // Don't climb into a loop though.
4346      const Loop *IDomLoop = LI.getLoopFor(IDom);
4347      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
4348      if (IDomDepth <= IPLoopDepth &&
4349          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
4350        break;
4351    }
4352
4353    bool AllDominate = true;
4354    Instruction *BetterPos = nullptr;
4355    Instruction *Tentative = IDom->getTerminator();
4356    for (Instruction *Inst : Inputs) {
4357      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
4358        AllDominate = false;
4359        break;
4360      }
4361      // Attempt to find an insert position in the middle of the block,
4362      // instead of at the end, so that it can be used for other expansions.
4363      if (IDom == Inst->getParent() &&
4364          (!BetterPos || !DT.dominates(Inst, BetterPos)))
4365        BetterPos = &*std::next(BasicBlock::iterator(Inst));
4366    }
4367    if (!AllDominate)
4368      break;
4369    if (BetterPos)
4370      IP = BetterPos->getIterator();
4371    else
4372      IP = Tentative->getIterator();
4373  }
4374
4375  return IP;
4376}
4377
4378/// Determine an input position which will be dominated by the operands and
4379/// which will dominate the result.
4380BasicBlock::iterator
4381LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
4382                                           const LSRFixup &LF,
4383                                           const LSRUse &LU,
4384                                           SCEVExpander &Rewriter) const {
4385  // Collect some instructions which must be dominated by the
4386  // expanding replacement. These must be dominated by any operands that
4387  // will be required in the expansion.
4388  SmallVector<Instruction *, 4> Inputs;
4389  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
4390    Inputs.push_back(I);
4391  if (LU.Kind == LSRUse::ICmpZero)
4392    if (Instruction *I =
4393          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
4394      Inputs.push_back(I);
4395  if (LF.PostIncLoops.count(L)) {
4396    if (LF.isUseFullyOutsideLoop(L))
4397      Inputs.push_back(L->getLoopLatch()->getTerminator());
4398    else
4399      Inputs.push_back(IVIncInsertPos);
4400  }
4401  // The expansion must also be dominated by the increment positions of any
4402  // loops for which it is using post-inc mode.
4403  for (const Loop *PIL : LF.PostIncLoops) {
4404    if (PIL == L) continue;
4405
4406    // Be dominated by the loop exit.
4407    SmallVector<BasicBlock *, 4> ExitingBlocks;
4408    PIL->getExitingBlocks(ExitingBlocks);
4409    if (!ExitingBlocks.empty()) {
4410      BasicBlock *BB = ExitingBlocks[0];
4411      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
4412        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
4413      Inputs.push_back(BB->getTerminator());
4414    }
4415  }
4416
4417  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
4418         !isa<DbgInfoIntrinsic>(LowestIP) &&
4419         "Insertion point must be a normal instruction");
4420
4421  // Then, climb up the immediate dominator tree as far as we can go while
4422  // still being dominated by the input positions.
4423  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
4424
4425  // Don't insert instructions before PHI nodes.
4426  while (isa<PHINode>(IP)) ++IP;
4427
4428  // Ignore landingpad instructions.
4429  while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP;
4430
4431  // Ignore debug intrinsics.
4432  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
4433
4434  // Set IP below instructions recently inserted by SCEVExpander. This keeps the
4435  // IP consistent across expansions and allows the previously inserted
4436  // instructions to be reused by subsequent expansion.
4437  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
4438    ++IP;
4439
4440  return IP;
4441}
4442
4443/// Emit instructions for the leading candidate expression for this LSRUse (this
4444/// is called "expanding").
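/// For example, a formula of the rough shape reg(A) + 2*reg(B) + Offset for an
/// address use expands to something like:
///
///   %b.scaled = mul i64 %b, 2
///   %sum      = add i64 %a, %b.scaled
///   %addr     = add i64 %sum, Offset
///
/// where %a and %b are the expansions of A and B. The operand list is flushed
/// at several points below so the pieces stay next to the user and can be
/// folded into the target's addressing mode.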
4445Value *LSRInstance::Expand(const LSRFixup &LF,
4446                           const Formula &F,
4447                           BasicBlock::iterator IP,
4448                           SCEVExpander &Rewriter,
4449                           SmallVectorImpl<WeakVH> &DeadInsts) const {
4450  const LSRUse &LU = Uses[LF.LUIdx];
4451  if (LU.RigidFormula)
4452    return LF.OperandValToReplace;
4453
4454  // Determine an input position which will be dominated by the operands and
4455  // which will dominate the result.
4456  IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
4457
4458  // Inform the Rewriter if we have a post-increment use, so that it can
4459  // perform an advantageous expansion.
4460  Rewriter.setPostInc(LF.PostIncLoops);
4461
4462  // This is the type that the user actually needs.
4463  Type *OpTy = LF.OperandValToReplace->getType();
4464  // This will be the type that we'll initially expand to.
4465  Type *Ty = F.getType();
4466  if (!Ty)
4467    // No type known; just expand directly to the ultimate type.
4468    Ty = OpTy;
4469  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
4470    // Expand directly to the ultimate type if it's the right size.
4471    Ty = OpTy;
4472  // This is the type to do integer arithmetic in.
4473  Type *IntTy = SE.getEffectiveSCEVType(Ty);
4474
4475  // Build up a list of operands to add together to form the full base.
4476  SmallVector<const SCEV *, 8> Ops;
4477
4478  // Expand the BaseRegs portion.
4479  for (const SCEV *Reg : F.BaseRegs) {
4480    assert(!Reg->isZero() && "Zero allocated in a base register!");
4481
4482    // If we're expanding for a post-inc user, make the post-inc adjustment.
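    // This undoes the normalization applied when the use was collected: for a
    // post-inc user, a register such as {0,+,1} is rewritten back to {1,+,1}
    // so the expanded value is the value after the increment.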
4483    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4484    Reg = TransformForPostIncUse(Denormalize, Reg,
4485                                 LF.UserInst, LF.OperandValToReplace,
4486                                 Loops, SE, DT);
4487
4488    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP)));
4489  }
4490
4491  // Expand the ScaledReg portion.
4492  Value *ICmpScaledV = nullptr;
4493  if (F.Scale != 0) {
4494    const SCEV *ScaledS = F.ScaledReg;
4495
4496    // If we're expanding for a post-inc user, make the post-inc adjustment.
4497    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4498    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
4499                                     LF.UserInst, LF.OperandValToReplace,
4500                                     Loops, SE, DT);
4501
4502    if (LU.Kind == LSRUse::ICmpZero) {
4503      // Expand ScaledReg as if it were part of the base regs.
4504      if (F.Scale == 1)
4505        Ops.push_back(
4506            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)));
4507      else {
4508        // An interesting way of "folding" with an icmp is to use a negated
4509        // scale, which we'll implement by inserting it into the other operand
4510        // of the icmp.
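        // For example, rather than materializing (%n - %i) and comparing it
        // against zero, %i is expanded here and later becomes the icmp's
        // second operand, producing a direct "icmp eq %n, %i".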
4511        assert(F.Scale == -1 &&
4512               "The only scale supported by ICmpZero uses is -1!");
4513        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP);
4514      }
4515    } else {
4516      // Otherwise just expand the scaled register and an explicit scale,
4517      // which is expected to be matched as part of the address.
4518
4519      // Flush the operand list to suppress SCEVExpander hoisting address
4520      // modes, unless the addressing mode will not be completely folded.
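      // (Flushing turns the operands accumulated so far into a concrete value
      // next to the user, so the expander cannot hoist them away from the
      // scaled register that follows.)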
4521      if (!Ops.empty() && LU.Kind == LSRUse::Address &&
4522          isAMCompletelyFolded(TTI, LU, F)) {
4523        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
4524        Ops.clear();
4525        Ops.push_back(SE.getUnknown(FullV));
4526      }
4527      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP));
4528      if (F.Scale != 1)
4529        ScaledS =
4530            SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
4531      Ops.push_back(ScaledS);
4532    }
4533  }
4534
4535  // Expand the GV portion.
4536  if (F.BaseGV) {
4537    // Flush the operand list to suppress SCEVExpander hoisting.
4538    if (!Ops.empty()) {
4539      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
4540      Ops.clear();
4541      Ops.push_back(SE.getUnknown(FullV));
4542    }
4543    Ops.push_back(SE.getUnknown(F.BaseGV));
4544  }
4545
4546  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
4547  // unfolded offsets. LSR assumes they both live next to their uses.
4548  if (!Ops.empty()) {
4549    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
4550    Ops.clear();
4551    Ops.push_back(SE.getUnknown(FullV));
4552  }
4553
4554  // Expand the immediate portion.
4555  int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
4556  if (Offset != 0) {
4557    if (LU.Kind == LSRUse::ICmpZero) {
4558      // The other interesting way of "folding" with an ICmpZero is to use a
4559      // negated immediate.
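      // For example, "icmp eq (add %x, 4), 0" can be emitted as
      // "icmp eq %x, -4", avoiding the add entirely.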
4560      if (!ICmpScaledV)
4561        ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
4562      else {
4563        Ops.push_back(SE.getUnknown(ICmpScaledV));
4564        ICmpScaledV = ConstantInt::get(IntTy, Offset);
4565      }
4566    } else {
4567      // Just add the immediate values. These again are expected to be matched
4568      // as part of the address.
4569      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
4570    }
4571  }
4572
4573  // Expand the unfolded offset portion.
4574  int64_t UnfoldedOffset = F.UnfoldedOffset;
4575  if (UnfoldedOffset != 0) {
4576    // Just add the immediate values.
4577    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
4578                                                       UnfoldedOffset)));
4579  }
4580
4581  // Emit instructions summing all the operands.
4582  const SCEV *FullS = Ops.empty() ?
4583                      SE.getConstant(IntTy, 0) :
4584                      SE.getAddExpr(Ops);
4585  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP);
4586
4587  // We're done expanding now, so reset the rewriter.
4588  Rewriter.clearPostInc();
4589
4590  // An ICmpZero Formula represents an ICmp which we're handling as a
4591  // comparison against zero. Now that we've expanded an expression for that
4592  // form, update the ICmp's other operand.
4593  if (LU.Kind == LSRUse::ICmpZero) {
4594    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
4595    DeadInsts.emplace_back(CI->getOperand(1));
4596    assert(!F.BaseGV && "ICmp does not support folding a global value and "
4597                           "a scale at the same time!");
4598    if (F.Scale == -1) {
4599      if (ICmpScaledV->getType() != OpTy) {
4600        Instruction *Cast =
4601          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
4602                                                   OpTy, false),
4603                           ICmpScaledV, OpTy, "tmp", CI);
4604        ICmpScaledV = Cast;
4605      }
4606      CI->setOperand(1, ICmpScaledV);
4607    } else {
4608      // A scale of 1 means that the scale has been expanded as part of the
4609      // base regs.
4610      assert((F.Scale == 0 || F.Scale == 1) &&
4611             "ICmpZero uses should reach this point only with "
4612             "a scale of 0 or 1!");
4613      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
4614                                           -(uint64_t)Offset);
4615      if (C->getType() != OpTy)
4616        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
4617                                                          OpTy, false),
4618                                  C, OpTy);
4619
4620      CI->setOperand(1, C);
4621    }
4622  }
4623
4624  return FullV;
4625}
4626
4627/// Helper for Rewrite. PHI nodes are special because the use of their operands
4628/// effectively happens in their predecessor blocks, so the expression may need
4629/// to be expanded in multiple places.
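/// For example, if a PHI receives the replaced value from two different
/// predecessor blocks, the expression is expanded separately in front of each
/// predecessor's terminator, and the Inserted map below reuses an expansion
/// when several incoming entries share the same predecessor block.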
4630void LSRInstance::RewriteForPHI(PHINode *PN,
4631                                const LSRFixup &LF,
4632                                const Formula &F,
4633                                SCEVExpander &Rewriter,
4634                                SmallVectorImpl<WeakVH> &DeadInsts) const {
4635  DenseMap<BasicBlock *, Value *> Inserted;
4636  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
4637    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
4638      BasicBlock *BB = PN->getIncomingBlock(i);
4639
4640      // If this is a critical edge, split the edge so that we do not insert
4641      // the code on all predecessor/successor paths.  We do this unless this
4642      // is the canonical backedge for this loop, which complicates post-inc
4643      // users.
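      // For example, if BB ends in a conditional branch whose successors are
      // the PHI's block and some unrelated block, expanding in BB would run
      // the new code on both paths; splitting the edge gives the expansion a
      // block of its own on the path into the PHI.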
4644      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
4645          !isa<IndirectBrInst>(BB->getTerminator())) {
4646        BasicBlock *Parent = PN->getParent();
4647        Loop *PNLoop = LI.getLoopFor(Parent);
4648        if (!PNLoop || Parent != PNLoop->getHeader()) {
4649          // Split the critical edge.
4650          BasicBlock *NewBB = nullptr;
4651          if (!Parent->isLandingPad()) {
4652            NewBB = SplitCriticalEdge(BB, Parent,
4653                                      CriticalEdgeSplittingOptions(&DT, &LI)
4654                                          .setMergeIdenticalEdges()
4655                                          .setDontDeleteUselessPHIs());
4656          } else {
4657            SmallVector<BasicBlock*, 2> NewBBs;
4658            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
4659            NewBB = NewBBs[0];
4660          }
4661          // If NewBB==NULL, then SplitCriticalEdge refused to split because all
4662          // phi predecessors are identical. The simple thing to do is skip
4663          // splitting in this case rather than complicate the API.
4664          if (NewBB) {
4665            // If PN is outside of the loop and BB is in the loop, we want to
4666            // move the block to be immediately before the PHI block, not
4667            // immediately after BB.
4668            if (L->contains(BB) && !L->contains(PN))
4669              NewBB->moveBefore(PN->getParent());
4670
4671            // Splitting the edge can reduce the number of PHI entries we have.
4672            e = PN->getNumIncomingValues();
4673            BB = NewBB;
4674            i = PN->getBasicBlockIndex(BB);
4675          }
4676        }
4677      }
4678
4679      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
4680        Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
4681      if (!Pair.second)
4682        PN->setIncomingValue(i, Pair.first->second);
4683      else {
4684        Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
4685                              Rewriter, DeadInsts);
4686
4687        // If this is reuse-by-noop-cast, insert the noop cast.
4688        Type *OpTy = LF.OperandValToReplace->getType();
4689        if (FullV->getType() != OpTy)
4690          FullV =
4691            CastInst::Create(CastInst::getCastOpcode(FullV, false,
4692                                                     OpTy, false),
4693                             FullV, LF.OperandValToReplace->getType(),
4694                             "tmp", BB->getTerminator());
4695
4696        PN->setIncomingValue(i, FullV);
4697        Pair.first->second = FullV;
4698      }
4699    }
4700}
4701
4702/// Emit instructions for the leading candidate expression for this LSRUse (this
4703/// is called "expanding"), and update the UserInst to reference the newly
4704/// expanded value.
4705void LSRInstance::Rewrite(const LSRFixup &LF,
4706                          const Formula &F,
4707                          SCEVExpander &Rewriter,
4708                          SmallVectorImpl<WeakVH> &DeadInsts) const {
4709  // First, find an insertion point that dominates UserInst. For PHI nodes,
4710  // find the nearest block which dominates all the relevant uses.
4711  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
4712    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);
4713  } else {
4714    Value *FullV =
4715        Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
4716
4717    // If this is reuse-by-noop-cast, insert the noop cast.
4718    Type *OpTy = LF.OperandValToReplace->getType();
4719    if (FullV->getType() != OpTy) {
4720      Instruction *Cast =
4721        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
4722                         FullV, OpTy, "tmp", LF.UserInst);
4723      FullV = Cast;
4724    }
4725
4726    // Update the user. ICmpZero is handled specially here (for now) because
4727    // Expand may have updated one of the operands of the icmp already, and
4728    // its new value may happen to be equal to LF.OperandValToReplace, in
4729    // which case doing replaceUsesOfWith leads to replacing both operands
4730    // with the same value. TODO: Reorganize this.
4731    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
4732      LF.UserInst->setOperand(0, FullV);
4733    else
4734      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
4735  }
4736
4737  DeadInsts.emplace_back(LF.OperandValToReplace);
4738}
4739
4740/// Rewrite all the fixup locations with new values, following the chosen
4741/// solution.
4742void LSRInstance::ImplementSolution(
4743    const SmallVectorImpl<const Formula *> &Solution) {
4744  // Keep track of instructions we may have made dead, so that
4745  // we can remove them after we are done working.
4746  SmallVector<WeakVH, 16> DeadInsts;
4747
4748  SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
4749                        "lsr");
4750#ifndef NDEBUG
4751  Rewriter.setDebugType(DEBUG_TYPE);
4752#endif
4753  Rewriter.disableCanonicalMode();
4754  Rewriter.enableLSRMode();
4755  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
4756
4757  // Mark phi nodes that terminate chains so the expander tries to reuse them.
4758  for (const IVChain &Chain : IVChainVec) {
4759    if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
4760      Rewriter.setChainedPhi(PN);
4761  }
4762
4763  // Expand the new value definitions and update the users.
4764  for (const LSRFixup &Fixup : Fixups) {
4765    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);
4766
4767    Changed = true;
4768  }
4769
4770  for (const IVChain &Chain : IVChainVec) {
4771    GenerateIVChain(Chain, Rewriter, DeadInsts);
4772    Changed = true;
4773  }
4774  // Clean up after ourselves. This must be done before deleting any
4775  // instructions.
4776  Rewriter.clear();
4777
4778  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
4779}
4780
4781LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
4782                         DominatorTree &DT, LoopInfo &LI,
4783                         const TargetTransformInfo &TTI)
4784    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
4785      IVIncInsertPos(nullptr) {
4786  // If LoopSimplify form is not available, stay out of trouble.
4787  if (!L->isLoopSimplifyForm())
4788    return;
4789
4790  // If there's no interesting work to be done, bail early.
4791  if (IU.empty()) return;
4792
4793  // If there's too much analysis to be done, bail early. We won't be able to
4794  // model the problem anyway.
4795  unsigned NumUsers = 0;
4796  for (const IVStrideUse &U : IU) {
4797    if (++NumUsers > MaxIVUsers) {
4798      (void)U;
4799      DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
4800      return;
4801    }
4802    // Bail out if we have a PHI on an EHPad that gets a value from a
4803    // CatchSwitchInst.  Because the CatchSwitchInst cannot be split, there is
4804    // no good place to stick any instructions.
4805    if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
4806      auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
4807      if (isa<FuncletPadInst>(FirstNonPHI) ||
4808          isa<CatchSwitchInst>(FirstNonPHI))
4809        for (BasicBlock *PredBB : PN->blocks())
4810          if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
4811            return;
4812    }
4813  }
4814
4815#ifndef NDEBUG
4816  // All dominating loops must have preheaders, or SCEVExpander may not be able
4817  // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
4818  //
4819  // IVUsers analysis should only create users that are dominated by simple loop
4820  // headers. Since this loop should dominate all of its users, its user list
4821  // should be empty if this loop itself is not within a simple loop nest.
4822  for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
4823       Rung; Rung = Rung->getIDom()) {
4824    BasicBlock *BB = Rung->getBlock();
4825    const Loop *DomLoop = LI.getLoopFor(BB);
4826    if (DomLoop && DomLoop->getHeader() == BB) {
4827      assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
4828    }
4829  }
4830#endif // NDEBUG
4831
4832  DEBUG(dbgs() << "\nLSR on loop ";
4833        L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
4834        dbgs() << ":\n");
4835
4836  // First, perform some low-level loop optimizations.
4837  OptimizeShadowIV();
4838  OptimizeLoopTermCond();
4839
4840  // If loop preparation eliminates all interesting IV users, bail.
4841  if (IU.empty()) return;
4842
4843  // Skip non-innermost loops until we can model them better with formulae.
4844  if (!L->empty()) {
4845    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
4846    return;
4847  }
4848
4849  // Start collecting data and preparing for the solver.
4850  CollectChains();
4851  CollectInterestingTypesAndFactors();
4852  CollectFixupsAndInitialFormulae();
4853  CollectLoopInvariantFixupsAndFormulae();
4854
4855  assert(!Uses.empty() && "IVUsers reported at least one use");
4856  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
4857        print_uses(dbgs()));
4858
4859  // Now use the reuse data to generate a bunch of interesting ways
4860  // to formulate the values needed for the uses.
4861  GenerateAllReuseFormulae();
4862
4863  FilterOutUndesirableDedicatedRegisters();
4864  NarrowSearchSpaceUsingHeuristics();
4865
4866  SmallVector<const Formula *, 8> Solution;
4867  Solve(Solution);
4868
4869  // Release memory that is no longer needed.
4870  Factors.clear();
4871  Types.clear();
4872  RegUses.clear();
4873
4874  if (Solution.empty())
4875    return;
4876
4877#ifndef NDEBUG
4878  // Formulae should be legal.
4879  for (const LSRUse &LU : Uses) {
4880    for (const Formula &F : LU.Formulae)
4881      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4882                        F) && "Illegal formula generated!");
4883  }
4884#endif
4885
4886  // Now that we've decided what we want, make it so.
4887  ImplementSolution(Solution);
4888}
4889
4890void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
4891  if (Factors.empty() && Types.empty()) return;
4892
4893  OS << "LSR has identified the following interesting factors and types: ";
4894  bool First = true;
4895
4896  for (int64_t Factor : Factors) {
4897    if (!First) OS << ", ";
4898    First = false;
4899    OS << '*' << Factor;
4900  }
4901
4902  for (Type *Ty : Types) {
4903    if (!First) OS << ", ";
4904    First = false;
4905    OS << '(' << *Ty << ')';
4906  }
4907  OS << '\n';
4908}
4909
4910void LSRInstance::print_fixups(raw_ostream &OS) const {
4911  OS << "LSR is examining the following fixup sites:\n";
4912  for (const LSRFixup &LF : Fixups) {
4913    OS << "  ";
4914    LF.print(OS);
4915    OS << '\n';
4916  }
4917}
4918
4919void LSRInstance::print_uses(raw_ostream &OS) const {
4920  OS << "LSR is examining the following uses:\n";
4921  for (const LSRUse &LU : Uses) {
4922    OS << "  ";
4923    LU.print(OS);
4924    OS << '\n';
4925    for (const Formula &F : LU.Formulae) {
4926      OS << "    ";
4927      F.print(OS);
4928      OS << '\n';
4929    }
4930  }
4931}
4932
4933void LSRInstance::print(raw_ostream &OS) const {
4934  print_factors_and_types(OS);
4935  print_fixups(OS);
4936  print_uses(OS);
4937}
4938
4939LLVM_DUMP_METHOD
4940void LSRInstance::dump() const {
4941  print(errs()); errs() << '\n';
4942}
4943
4944namespace {
4945
4946class LoopStrengthReduce : public LoopPass {
4947public:
4948  static char ID; // Pass ID, replacement for typeid
4949  LoopStrengthReduce();
4950
4951private:
4952  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
4953  void getAnalysisUsage(AnalysisUsage &AU) const override;
4954};
4955
4956}
4957
4958char LoopStrengthReduce::ID = 0;
4959INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
4960                "Loop Strength Reduction", false, false)
4961INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
4962INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4963INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
4964INITIALIZE_PASS_DEPENDENCY(IVUsers)
4965INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
4966INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4967INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
4968                "Loop Strength Reduction", false, false)
4969
4970
4971Pass *llvm::createLoopStrengthReducePass() {
4972  return new LoopStrengthReduce();
4973}
4974
4975LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
4976  initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
4977}
4978
4979void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
4980  // We split critical edges, so we change the CFG.  However, we do update
4981  // many analyses if they are around.
4982  AU.addPreservedID(LoopSimplifyID);
4983
4984  AU.addRequired<LoopInfoWrapperPass>();
4985  AU.addPreserved<LoopInfoWrapperPass>();
4986  AU.addRequiredID(LoopSimplifyID);
4987  AU.addRequired<DominatorTreeWrapperPass>();
4988  AU.addPreserved<DominatorTreeWrapperPass>();
4989  AU.addRequired<ScalarEvolutionWrapperPass>();
4990  AU.addPreserved<ScalarEvolutionWrapperPass>();
4991  // Requiring LoopSimplify a second time here prevents IVUsers from running
4992  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
4993  AU.addRequiredID(LoopSimplifyID);
4994  AU.addRequired<IVUsers>();
4995  AU.addPreserved<IVUsers>();
4996  AU.addRequired<TargetTransformInfoWrapperPass>();
4997}
4998
4999bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
5000  if (skipOptnoneFunction(L))
5001    return false;
5002
5003  auto &IU = getAnalysis<IVUsers>();
5004  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
5005  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5006  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
5007  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
5008      *L->getHeader()->getParent());
5009  bool Changed = false;
5010
5011  // Run the main LSR transformation.
5012  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
5013
5014  // Remove any extra phis created by processing inner loops.
5015  Changed |= DeleteDeadPHIs(L->getHeader());
5016  if (EnablePhiElim && L->isLoopSimplifyForm()) {
5017    SmallVector<WeakVH, 16> DeadInsts;
5018    const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
5019    SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL,
5020                          "lsr");
5021#ifndef NDEBUG
5022    Rewriter.setDebugType(DEBUG_TYPE);
5023#endif
5024    unsigned numFolded = Rewriter.replaceCongruentIVs(
5025        L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
5026        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
5027            *L->getHeader()->getParent()));
5028    if (numFolded) {
5029      Changed = true;
5030      DeleteTriviallyDeadInstructions(DeadInsts);
5031      DeleteDeadPHIs(L->getHeader());
5032    }
5033  }
5034  return Changed;
5035}
5036