1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21//    of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23//    widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25//    of vectorization. It decides on the optimal vector width, which
26//    can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46//  Data for SIMD
47//
48// Other ideas/concepts are from:
49//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52//  Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SetVector.h"
73#include "llvm/ADT/SmallPtrSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpander.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/TargetLibraryInfo.h"
97#include "llvm/Analysis/TargetTransformInfo.h"
98#include "llvm/Analysis/VectorUtils.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfoMetadata.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/DiagnosticInfo.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
115#include "llvm/IR/IntrinsicInst.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/LLVMContext.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/Type.h"
122#include "llvm/IR/Use.h"
123#include "llvm/IR/User.h"
124#include "llvm/IR/Value.h"
125#include "llvm/IR/ValueHandle.h"
126#include "llvm/IR/Verifier.h"
127#include "llvm/InitializePasses.h"
128#include "llvm/Pass.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Compiler.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/raw_ostream.h"
136#include "llvm/Transforms/Utils/BasicBlockUtils.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/SizeOpts.h"
141#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142#include <algorithm>
143#include <cassert>
144#include <cstdint>
145#include <cstdlib>
146#include <functional>
147#include <iterator>
148#include <limits>
149#include <memory>
150#include <string>
151#include <tuple>
152#include <utility>
153
154using namespace llvm;
155
156#define LV_NAME "loop-vectorize"
157#define DEBUG_TYPE LV_NAME
158
159/// @{
160/// Metadata attribute names
161static const char *const LLVMLoopVectorizeFollowupAll =
162    "llvm.loop.vectorize.followup_all";
163static const char *const LLVMLoopVectorizeFollowupVectorized =
164    "llvm.loop.vectorize.followup_vectorized";
165static const char *const LLVMLoopVectorizeFollowupEpilogue =
166    "llvm.loop.vectorize.followup_epilogue";
167/// @}
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171
172/// Loops with a known constant trip count below this number are vectorized only
173/// if no scalar iteration overheads are incurred.
174static cl::opt<unsigned> TinyTripCountVectorThreshold(
175    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176    cl::desc("Loops with a constant trip count that is smaller than this "
177             "value are vectorized only if no scalar iteration overheads "
178             "are incurred."));
179
180// Indicates that an epilogue is undesired, predication is preferred.
181// This means that the vectorizer will try to fold the loop-tail (epilogue)
182// into the loop and predicate the loop body accordingly.
183static cl::opt<bool> PreferPredicateOverEpilog(
184    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185    cl::desc("Indicate that an epilogue is undesired, predication should be "
186             "used instead."));
187
188static cl::opt<bool> MaximizeBandwidth(
189    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190    cl::desc("Maximize bandwidth when selecting vectorization factor which "
191             "will be determined by the smallest type in loop."));
192
193static cl::opt<bool> EnableInterleavedMemAccesses(
194    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196
197/// An interleave-group may need masking if it resides in a block that needs
198/// predication, or in order to mask away gaps.
199static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202
203static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
205    cl::desc("We don't interleave loops with a estimated constant trip count "
206             "below this number"));
207
208static cl::opt<unsigned> ForceTargetNumScalarRegs(
209    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210    cl::desc("A flag that overrides the target's number of scalar registers."));
211
212static cl::opt<unsigned> ForceTargetNumVectorRegs(
213    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214    cl::desc("A flag that overrides the target's number of vector registers."));
215
216static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218    cl::desc("A flag that overrides the target's max interleave factor for "
219             "scalar loops."));
220
221static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223    cl::desc("A flag that overrides the target's max interleave factor for "
224             "vectorized loops."));
225
226static cl::opt<unsigned> ForceTargetInstructionCost(
227    "force-target-instruction-cost", cl::init(0), cl::Hidden,
228    cl::desc("A flag that overrides the target's expected cost for "
229             "an instruction to a single constant value. Mostly "
230             "useful for getting consistent testing."));
231
232static cl::opt<unsigned> SmallLoopCost(
233    "small-loop-cost", cl::init(20), cl::Hidden,
234    cl::desc(
235        "The cost of a loop that is considered 'small' by the interleaver."));
236
237static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239    cl::desc("Enable the use of the block frequency analysis to access PGO "
240             "heuristics minimizing code growth in cold regions and being more "
241             "aggressive in hot regions."));
242
243// Runtime interleave loops for load/store throughput.
244static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246    cl::desc(
247        "Enable runtime interleaving until load/store ports are saturated"));
248
249/// The number of stores in a loop that are allowed to need predication.
250static cl::opt<unsigned> NumberOfStoresToPredicate(
251    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252    cl::desc("Max number of stores to be predicated behind an if."));
253
254static cl::opt<bool> EnableIndVarRegisterHeur(
255    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256    cl::desc("Count the induction variable only once when interleaving"));
257
258static cl::opt<bool> EnableCondStoresVectorization(
259    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260    cl::desc("Enable if predication of stores during vectorization."));
261
262static cl::opt<unsigned> MaxNestedScalarReductionIC(
263    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264    cl::desc("The maximum interleave count to use when interleaving a scalar "
265             "reduction in a nested loop."));
266
267cl::opt<bool> EnableVPlanNativePath(
268    "enable-vplan-native-path", cl::init(false), cl::Hidden,
269    cl::desc("Enable VPlan-native vectorization path with "
270             "support for outer loop vectorization."));
271
272// FIXME: Remove this switch once we have divergence analysis. Currently we
273// assume divergent non-backedge branches when this switch is true.
274cl::opt<bool> EnableVPlanPredication(
275    "enable-vplan-predication", cl::init(false), cl::Hidden,
276    cl::desc("Enable VPlan-native vectorization path predicator with "
277             "support for outer loop vectorization."));
278
279// This flag enables the stress testing of the VPlan H-CFG construction in the
280// VPlan-native vectorization path. It must be used in conjuction with
281// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282// verification of the H-CFGs built.
283static cl::opt<bool> VPlanBuildStressTest(
284    "vplan-build-stress-test", cl::init(false), cl::Hidden,
285    cl::desc(
286        "Build VPlan for every supported loop nest in the function and bail "
287        "out right after the build (stress test the VPlan H-CFG construction "
288        "in the VPlan-native vectorization path)."));
289
290cl::opt<bool> llvm::EnableLoopInterleaving(
291    "interleave-loops", cl::init(true), cl::Hidden,
292    cl::desc("Enable loop interleaving in Loop vectorization passes"));
293cl::opt<bool> llvm::EnableLoopVectorization(
294    "vectorize-loops", cl::init(true), cl::Hidden,
295    cl::desc("Run the Loop vectorization passes"));
296
297/// A helper function for converting Scalar types to vector types.
298/// If the incoming type is void, we return void. If the VF is 1, we return
299/// the scalar type.
300static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301  if (Scalar->isVoidTy() || VF == 1)
302    return Scalar;
303  return VectorType::get(Scalar, VF);
304}
305
306/// A helper function that returns the type of loaded or stored value.
307static Type *getMemInstValueType(Value *I) {
308  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309         "Expected Load or Store instruction");
310  if (auto *LI = dyn_cast<LoadInst>(I))
311    return LI->getType();
312  return cast<StoreInst>(I)->getValueOperand()->getType();
313}
314
315/// A helper function that returns true if the given type is irregular. The
316/// type is irregular if its allocated size doesn't equal the store size of an
317/// element of the corresponding vector type at the given vectorization factor.
318static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319  // Determine if an array of VF elements of type Ty is "bitcast compatible"
320  // with a <VF x Ty> vector.
321  if (VF > 1) {
322    auto *VectorTy = VectorType::get(Ty, VF);
323    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324  }
325
326  // If the vectorization factor is one, we just check if an array of type Ty
327  // requires padding between elements.
328  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329}
330
331/// A helper function that returns the reciprocal of the block probability of
332/// predicated blocks. If we return X, we are assuming the predicated block
333/// will execute once for every X iterations of the loop header.
334///
335/// TODO: We should use actual block probability here, if available. Currently,
336///       we always assume predicated blocks have a 50% chance of executing.
337static unsigned getReciprocalPredBlockProb() { return 2; }
338
339/// A helper function that adds a 'fast' flag to floating-point operations.
340static Value *addFastMathFlag(Value *V) {
341  if (isa<FPMathOperator>(V))
342    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343  return V;
344}
345
346static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347  if (isa<FPMathOperator>(V))
348    cast<Instruction>(V)->setFastMathFlags(FMF);
349  return V;
350}
351
352/// A helper function that returns an integer or floating-point constant with
353/// value C.
354static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                           : ConstantFP::get(Ty, C);
357}
358
359/// Returns "best known" trip count for the specified loop \p L as defined by
360/// the following procedure:
361///   1) Returns exact trip count if it is known.
362///   2) Returns expected trip count according to profile data if any.
363///   3) Returns upper bound estimate if it is known.
364///   4) Returns None if all of the above failed.
365static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366  // Check if exact trip count is known.
367  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368    return ExpectedTC;
369
370  // Check if there is an expected trip count available from profile data.
371  if (LoopVectorizeWithBlockFrequency)
372    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373      return EstimatedTC;
374
375  // Check if upper bound estimate is known.
376  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377    return ExpectedTC;
378
379  return None;
380}
381
382namespace llvm {
383
384/// InnerLoopVectorizer vectorizes loops which contain only one basic
385/// block to a specified vectorization factor (VF).
386/// This class performs the widening of scalars into vectors, or multiple
387/// scalars. This class also implements the following features:
388/// * It inserts an epilogue loop for handling loops that don't have iteration
389///   counts that are known to be a multiple of the vectorization factor.
390/// * It handles the code generation for reduction variables.
391/// * Scalarization (implementation using scalars) of un-vectorizable
392///   instructions.
393/// InnerLoopVectorizer does not perform any vectorization-legality
394/// checks, and relies on the caller to check for the different legality
395/// aspects. The InnerLoopVectorizer relies on the
396/// LoopVectorizationLegality class to provide information about the induction
397/// and reduction variables that were found to a given vectorization factor.
398class InnerLoopVectorizer {
399public:
400  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                      LoopInfo *LI, DominatorTree *DT,
402                      const TargetLibraryInfo *TLI,
403                      const TargetTransformInfo *TTI, AssumptionCache *AC,
404                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                      LoopVectorizationCostModel *CM)
407      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409        Builder(PSE.getSE()->getContext()),
410        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411  virtual ~InnerLoopVectorizer() = default;
412
413  /// Create a new empty loop. Unlink the old loop and connect the new one.
414  /// Return the pre-header block of the new loop.
415  BasicBlock *createVectorizedLoopSkeleton();
416
417  /// Widen a single instruction within the innermost loop.
418  void widenInstruction(Instruction &I);
419
420  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
421  void fixVectorizedLoop();
422
423  // Return true if any runtime check is added.
424  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425
426  /// A type for vectorized values in the new loop. Each value from the
427  /// original loop, when vectorized, is represented by UF vector values in the
428  /// new unrolled loop, where UF is the unroll factor.
429  using VectorParts = SmallVector<Value *, 2>;
430
431  /// Vectorize a single GetElementPtrInst based on information gathered and
432  /// decisions taken during planning.
433  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435
436  /// Vectorize a single PHINode in a block. This method handles the induction
437  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438  /// arbitrary length vectors.
439  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440
441  /// A helper function to scalarize a single Instruction in the innermost loop.
442  /// Generates a sequence of scalar instances for each lane between \p MinLane
443  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
444  /// inclusive..
445  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                            bool IfPredicateInstr);
447
448  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449  /// is provided, the integer induction variable will first be truncated to
450  /// the corresponding type.
451  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452
453  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454  /// vector or scalar value on-demand if one is not yet available. When
455  /// vectorizing a loop, we visit the definition of an instruction before its
456  /// uses. When visiting the definition, we either vectorize or scalarize the
457  /// instruction, creating an entry for it in the corresponding map. (In some
458  /// cases, such as induction variables, we will create both vector and scalar
459  /// entries.) Then, as we encounter uses of the definition, we derive values
460  /// for each scalar or vector use unless such a value is already available.
461  /// For example, if we scalarize a definition and one of its uses is vector,
462  /// we build the required vector on-demand with an insertelement sequence
463  /// when visiting the use. Otherwise, if the use is scalar, we can use the
464  /// existing scalar definition.
465  ///
466  /// Return a value in the new loop corresponding to \p V from the original
467  /// loop at unroll index \p Part. If the value has already been vectorized,
468  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470  /// a new vector value on-demand by inserting the scalar values into a vector
471  /// with an insertelement sequence. If the value has been neither vectorized
472  /// nor scalarized, it must be loop invariant, so we simply broadcast the
473  /// value into a vector.
474  Value *getOrCreateVectorValue(Value *V, unsigned Part);
475
476  /// Return a value in the new loop corresponding to \p V from the original
477  /// loop at unroll and vector indices \p Instance. If the value has been
478  /// vectorized but not scalarized, the necessary extractelement instruction
479  /// will be generated.
480  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481
482  /// Construct the vector value of a scalarized value \p V one lane at a time.
483  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484
485  /// Try to vectorize the interleaved access group that \p Instr belongs to
486  /// with the base address given in \p Addr, optionally masking the vector
487  /// operations if \p BlockInMask is non-null. Use \p State to translate given
488  /// VPValues to IR values in the vectorized loop.
489  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
490                                VPValue *Addr, VPValue *BlockInMask = nullptr);
491
492  /// Vectorize Load and Store instructions with the base address given in \p
493  /// Addr, optionally masking the vector operations if \p BlockInMask is
494  /// non-null. Use \p State to translate given VPValues to IR values in the
495  /// vectorized loop.
496  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
497                                  VPValue *Addr,
498                                  VPValue *BlockInMask = nullptr);
499
500  /// Set the debug location in the builder using the debug location in
501  /// the instruction.
502  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
503
504  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
505  void fixNonInductionPHIs(void);
506
507protected:
508  friend class LoopVectorizationPlanner;
509
510  /// A small list of PHINodes.
511  using PhiVector = SmallVector<PHINode *, 4>;
512
513  /// A type for scalarized values in the new loop. Each value from the
514  /// original loop, when scalarized, is represented by UF x VF scalar values
515  /// in the new unrolled loop, where UF is the unroll factor and VF is the
516  /// vectorization factor.
517  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
518
519  /// Set up the values of the IVs correctly when exiting the vector loop.
520  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
521                    Value *CountRoundDown, Value *EndValue,
522                    BasicBlock *MiddleBlock);
523
524  /// Create a new induction variable inside L.
525  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
526                                   Value *Step, Instruction *DL);
527
528  /// Handle all cross-iteration phis in the header.
529  void fixCrossIterationPHIs();
530
531  /// Fix a first-order recurrence. This is the second phase of vectorizing
532  /// this phi node.
533  void fixFirstOrderRecurrence(PHINode *Phi);
534
535  /// Fix a reduction cross-iteration phi. This is the second phase of
536  /// vectorizing this phi node.
537  void fixReduction(PHINode *Phi);
538
539  /// Clear NSW/NUW flags from reduction instructions if necessary.
540  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
541
542  /// The Loop exit block may have single value PHI nodes with some
543  /// incoming value. While vectorizing we only handled real values
544  /// that were defined inside the loop and we should have one value for
545  /// each predecessor of its parent basic block. See PR14725.
546  void fixLCSSAPHIs();
547
548  /// Iteratively sink the scalarized operands of a predicated instruction into
549  /// the block that was created for it.
550  void sinkScalarOperands(Instruction *PredInst);
551
552  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
553  /// represented as.
554  void truncateToMinimalBitwidths();
555
556  /// Create a broadcast instruction. This method generates a broadcast
557  /// instruction (shuffle) for loop invariant values and for the induction
558  /// value. If this is the induction variable then we extend it to N, N+1, ...
559  /// this is needed because each iteration in the loop corresponds to a SIMD
560  /// element.
561  virtual Value *getBroadcastInstrs(Value *V);
562
563  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
564  /// to each vector element of Val. The sequence starts at StartIndex.
565  /// \p Opcode is relevant for FP induction variable.
566  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
567                               Instruction::BinaryOps Opcode =
568                               Instruction::BinaryOpsEnd);
569
570  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
571  /// variable on which to base the steps, \p Step is the size of the step, and
572  /// \p EntryVal is the value from the original loop that maps to the steps.
573  /// Note that \p EntryVal doesn't have to be an induction variable - it
574  /// can also be a truncate instruction.
575  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
576                        const InductionDescriptor &ID);
577
578  /// Create a vector induction phi node based on an existing scalar one. \p
579  /// EntryVal is the value from the original loop that maps to the vector phi
580  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
581  /// truncate instruction, instead of widening the original IV, we widen a
582  /// version of the IV truncated to \p EntryVal's type.
583  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
584                                       Value *Step, Instruction *EntryVal);
585
586  /// Returns true if an instruction \p I should be scalarized instead of
587  /// vectorized for the chosen vectorization factor.
588  bool shouldScalarizeInstruction(Instruction *I) const;
589
590  /// Returns true if we should generate a scalar version of \p IV.
591  bool needsScalarInduction(Instruction *IV) const;
592
593  /// If there is a cast involved in the induction variable \p ID, which should
594  /// be ignored in the vectorized loop body, this function records the
595  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
596  /// cast. We had already proved that the casted Phi is equal to the uncasted
597  /// Phi in the vectorized loop (under a runtime guard), and therefore
598  /// there is no need to vectorize the cast - the same value can be used in the
599  /// vector loop for both the Phi and the cast.
600  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
601  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
602  ///
603  /// \p EntryVal is the value from the original loop that maps to the vector
604  /// phi node and is used to distinguish what is the IV currently being
605  /// processed - original one (if \p EntryVal is a phi corresponding to the
606  /// original IV) or the "newly-created" one based on the proof mentioned above
607  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
608  /// latter case \p EntryVal is a TruncInst and we must not record anything for
609  /// that IV, but it's error-prone to expect callers of this routine to care
610  /// about that, hence this explicit parameter.
611  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
612                                             const Instruction *EntryVal,
613                                             Value *VectorLoopValue,
614                                             unsigned Part,
615                                             unsigned Lane = UINT_MAX);
616
617  /// Generate a shuffle sequence that will reverse the vector Vec.
618  virtual Value *reverseVector(Value *Vec);
619
620  /// Returns (and creates if needed) the original loop trip count.
621  Value *getOrCreateTripCount(Loop *NewLoop);
622
623  /// Returns (and creates if needed) the trip count of the widened loop.
624  Value *getOrCreateVectorTripCount(Loop *NewLoop);
625
626  /// Returns a bitcasted value to the requested vector type.
627  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
628  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
629                                const DataLayout &DL);
630
631  /// Emit a bypass check to see if the vector trip count is zero, including if
632  /// it overflows.
633  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
634
635  /// Emit a bypass check to see if all of the SCEV assumptions we've
636  /// had to make are correct.
637  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
638
639  /// Emit bypass checks to check any memory assumptions we may have made.
640  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
641
642  /// Compute the transformed value of Index at offset StartValue using step
643  /// StepValue.
644  /// For integer induction, returns StartValue + Index * StepValue.
645  /// For pointer induction, returns StartValue[Index * StepValue].
646  /// FIXME: The newly created binary instructions should contain nsw/nuw
647  /// flags, which can be found from the original scalar operations.
648  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
649                              const DataLayout &DL,
650                              const InductionDescriptor &ID) const;
651
652  /// Add additional metadata to \p To that was not present on \p Orig.
653  ///
654  /// Currently this is used to add the noalias annotations based on the
655  /// inserted memchecks.  Use this for instructions that are *cloned* into the
656  /// vector loop.
657  void addNewMetadata(Instruction *To, const Instruction *Orig);
658
659  /// Add metadata from one instruction to another.
660  ///
661  /// This includes both the original MDs from \p From and additional ones (\see
662  /// addNewMetadata).  Use this for *newly created* instructions in the vector
663  /// loop.
664  void addMetadata(Instruction *To, Instruction *From);
665
666  /// Similar to the previous function but it adds the metadata to a
667  /// vector of instructions.
668  void addMetadata(ArrayRef<Value *> To, Instruction *From);
669
670  /// The original loop.
671  Loop *OrigLoop;
672
673  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
674  /// dynamic knowledge to simplify SCEV expressions and converts them to a
675  /// more usable form.
676  PredicatedScalarEvolution &PSE;
677
678  /// Loop Info.
679  LoopInfo *LI;
680
681  /// Dominator Tree.
682  DominatorTree *DT;
683
684  /// Alias Analysis.
685  AliasAnalysis *AA;
686
687  /// Target Library Info.
688  const TargetLibraryInfo *TLI;
689
690  /// Target Transform Info.
691  const TargetTransformInfo *TTI;
692
693  /// Assumption Cache.
694  AssumptionCache *AC;
695
696  /// Interface to emit optimization remarks.
697  OptimizationRemarkEmitter *ORE;
698
699  /// LoopVersioning.  It's only set up (non-null) if memchecks were
700  /// used.
701  ///
702  /// This is currently only used to add no-alias metadata based on the
703  /// memchecks.  The actually versioning is performed manually.
704  std::unique_ptr<LoopVersioning> LVer;
705
706  /// The vectorization SIMD factor to use. Each vector will have this many
707  /// vector elements.
708  unsigned VF;
709
710  /// The vectorization unroll factor to use. Each scalar is vectorized to this
711  /// many different vector instructions.
712  unsigned UF;
713
714  /// The builder that we use
715  IRBuilder<> Builder;
716
717  // --- Vectorization state ---
718
719  /// The vector-loop preheader.
720  BasicBlock *LoopVectorPreHeader;
721
722  /// The scalar-loop preheader.
723  BasicBlock *LoopScalarPreHeader;
724
725  /// Middle Block between the vector and the scalar.
726  BasicBlock *LoopMiddleBlock;
727
728  /// The ExitBlock of the scalar loop.
729  BasicBlock *LoopExitBlock;
730
731  /// The vector loop body.
732  BasicBlock *LoopVectorBody;
733
734  /// The scalar loop body.
735  BasicBlock *LoopScalarBody;
736
737  /// A list of all bypass blocks. The first block is the entry of the loop.
738  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
739
740  /// The new Induction variable which was added to the new block.
741  PHINode *Induction = nullptr;
742
743  /// The induction variable of the old basic block.
744  PHINode *OldInduction = nullptr;
745
746  /// Maps values from the original loop to their corresponding values in the
747  /// vectorized loop. A key value can map to either vector values, scalar
748  /// values or both kinds of values, depending on whether the key was
749  /// vectorized and scalarized.
750  VectorizerValueMap VectorLoopValueMap;
751
752  /// Store instructions that were predicated.
753  SmallVector<Instruction *, 4> PredicatedInstructions;
754
755  /// Trip count of the original loop.
756  Value *TripCount = nullptr;
757
758  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
759  Value *VectorTripCount = nullptr;
760
761  /// The legality analysis.
762  LoopVectorizationLegality *Legal;
763
764  /// The profitablity analysis.
765  LoopVectorizationCostModel *Cost;
766
767  // Record whether runtime checks are added.
768  bool AddedSafetyChecks = false;
769
770  // Holds the end values for each induction variable. We save the end values
771  // so we can later fix-up the external users of the induction variables.
772  DenseMap<PHINode *, Value *> IVEndValues;
773
774  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
775  // fixed up at the end of vector code generation.
776  SmallVector<PHINode *, 8> OrigPHIsToFix;
777};
778
779class InnerLoopUnroller : public InnerLoopVectorizer {
780public:
781  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
782                    LoopInfo *LI, DominatorTree *DT,
783                    const TargetLibraryInfo *TLI,
784                    const TargetTransformInfo *TTI, AssumptionCache *AC,
785                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
786                    LoopVectorizationLegality *LVL,
787                    LoopVectorizationCostModel *CM)
788      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
789                            UnrollFactor, LVL, CM) {}
790
791private:
792  Value *getBroadcastInstrs(Value *V) override;
793  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
794                       Instruction::BinaryOps Opcode =
795                       Instruction::BinaryOpsEnd) override;
796  Value *reverseVector(Value *Vec) override;
797};
798
799} // end namespace llvm
800
801/// Look for a meaningful debug location on the instruction or it's
802/// operands.
803static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
804  if (!I)
805    return I;
806
807  DebugLoc Empty;
808  if (I->getDebugLoc() != Empty)
809    return I;
810
811  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
812    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
813      if (OpInst->getDebugLoc() != Empty)
814        return OpInst;
815  }
816
817  return I;
818}
819
820void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
821  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
822    const DILocation *DIL = Inst->getDebugLoc();
823    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
824        !isa<DbgInfoIntrinsic>(Inst)) {
825      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
826      if (NewDIL)
827        B.SetCurrentDebugLocation(NewDIL.getValue());
828      else
829        LLVM_DEBUG(dbgs()
830                   << "Failed to create new discriminator: "
831                   << DIL->getFilename() << " Line: " << DIL->getLine());
832    }
833    else
834      B.SetCurrentDebugLocation(DIL);
835  } else
836    B.SetCurrentDebugLocation(DebugLoc());
837}
838
839/// Write a record \p DebugMsg about vectorization failure to the debug
840/// output stream. If \p I is passed, it is an instruction that prevents
841/// vectorization.
842#ifndef NDEBUG
843static void debugVectorizationFailure(const StringRef DebugMsg,
844    Instruction *I) {
845  dbgs() << "LV: Not vectorizing: " << DebugMsg;
846  if (I != nullptr)
847    dbgs() << " " << *I;
848  else
849    dbgs() << '.';
850  dbgs() << '\n';
851}
852#endif
853
854/// Create an analysis remark that explains why vectorization failed
855///
856/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
857/// RemarkName is the identifier for the remark.  If \p I is passed it is an
858/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
859/// the location of the remark.  \return the remark object that can be
860/// streamed to.
861static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
862    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
863  Value *CodeRegion = TheLoop->getHeader();
864  DebugLoc DL = TheLoop->getStartLoc();
865
866  if (I) {
867    CodeRegion = I->getParent();
868    // If there is no debug location attached to the instruction, revert back to
869    // using the loop's.
870    if (I->getDebugLoc())
871      DL = I->getDebugLoc();
872  }
873
874  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
875  R << "loop not vectorized: ";
876  return R;
877}
878
879namespace llvm {
880
881void reportVectorizationFailure(const StringRef DebugMsg,
882    const StringRef OREMsg, const StringRef ORETag,
883    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
884  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
885  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
886  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
887                ORETag, TheLoop, I) << OREMsg);
888}
889
890} // end namespace llvm
891
892#ifndef NDEBUG
893/// \return string containing a file name and a line # for the given loop.
894static std::string getDebugLocString(const Loop *L) {
895  std::string Result;
896  if (L) {
897    raw_string_ostream OS(Result);
898    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
899      LoopDbgLoc.print(OS);
900    else
901      // Just print the module name.
902      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
903    OS.flush();
904  }
905  return Result;
906}
907#endif
908
909void InnerLoopVectorizer::addNewMetadata(Instruction *To,
910                                         const Instruction *Orig) {
911  // If the loop was versioned with memchecks, add the corresponding no-alias
912  // metadata.
913  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
914    LVer->annotateInstWithNoAlias(To, Orig);
915}
916
917void InnerLoopVectorizer::addMetadata(Instruction *To,
918                                      Instruction *From) {
919  propagateMetadata(To, From);
920  addNewMetadata(To, From);
921}
922
923void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
924                                      Instruction *From) {
925  for (Value *V : To) {
926    if (Instruction *I = dyn_cast<Instruction>(V))
927      addMetadata(I, From);
928  }
929}
930
931namespace llvm {
932
933// Loop vectorization cost-model hints how the scalar epilogue loop should be
934// lowered.
935enum ScalarEpilogueLowering {
936
937  // The default: allowing scalar epilogues.
938  CM_ScalarEpilogueAllowed,
939
940  // Vectorization with OptForSize: don't allow epilogues.
941  CM_ScalarEpilogueNotAllowedOptSize,
942
943  // A special case of vectorisation with OptForSize: loops with a very small
944  // trip count are considered for vectorization under OptForSize, thereby
945  // making sure the cost of their loop body is dominant, free of runtime
946  // guards and scalar iteration overheads.
947  CM_ScalarEpilogueNotAllowedLowTripLoop,
948
949  // Loop hint predicate indicating an epilogue is undesired.
950  CM_ScalarEpilogueNotNeededUsePredicate
951};
952
953/// LoopVectorizationCostModel - estimates the expected speedups due to
954/// vectorization.
955/// In many cases vectorization is not profitable. This can happen because of
956/// a number of reasons. In this class we mainly attempt to predict the
957/// expected speedup/slowdowns due to the supported instruction set. We use the
958/// TargetTransformInfo to query the different backends for the cost of
959/// different operations.
960class LoopVectorizationCostModel {
961public:
962  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
963                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
964                             LoopVectorizationLegality *Legal,
965                             const TargetTransformInfo &TTI,
966                             const TargetLibraryInfo *TLI, DemandedBits *DB,
967                             AssumptionCache *AC,
968                             OptimizationRemarkEmitter *ORE, const Function *F,
969                             const LoopVectorizeHints *Hints,
970                             InterleavedAccessInfo &IAI)
971      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
972        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
973        Hints(Hints), InterleaveInfo(IAI) {}
974
975  /// \return An upper bound for the vectorization factor, or None if
976  /// vectorization and interleaving should be avoided up front.
977  Optional<unsigned> computeMaxVF();
978
979  /// \return True if runtime checks are required for vectorization, and false
980  /// otherwise.
981  bool runtimeChecksRequired();
982
983  /// \return The most profitable vectorization factor and the cost of that VF.
984  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
985  /// then this vectorization factor will be selected if vectorization is
986  /// possible.
987  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
988
989  /// Setup cost-based decisions for user vectorization factor.
990  void selectUserVectorizationFactor(unsigned UserVF) {
991    collectUniformsAndScalars(UserVF);
992    collectInstsToScalarize(UserVF);
993  }
994
995  /// \return The size (in bits) of the smallest and widest types in the code
996  /// that needs to be vectorized. We ignore values that remain scalar such as
997  /// 64 bit loop indices.
998  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
999
1000  /// \return The desired interleave count.
1001  /// If interleave count has been specified by metadata it will be returned.
1002  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1003  /// are the selected vectorization factor and the cost of the selected VF.
1004  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1005
1006  /// Memory access instruction may be vectorized in more than one way.
1007  /// Form of instruction after vectorization depends on cost.
1008  /// This function takes cost-based decisions for Load/Store instructions
1009  /// and collects them in a map. This decisions map is used for building
1010  /// the lists of loop-uniform and loop-scalar instructions.
1011  /// The calculated cost is saved with widening decision in order to
1012  /// avoid redundant calculations.
1013  void setCostBasedWideningDecision(unsigned VF);
1014
1015  /// A struct that represents some properties of the register usage
1016  /// of a loop.
1017  struct RegisterUsage {
1018    /// Holds the number of loop invariant values that are used in the loop.
1019    /// The key is ClassID of target-provided register class.
1020    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1021    /// Holds the maximum number of concurrent live intervals in the loop.
1022    /// The key is ClassID of target-provided register class.
1023    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1024  };
1025
1026  /// \return Returns information about the register usages of the loop for the
1027  /// given vectorization factors.
1028  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1029
1030  /// Collect values we want to ignore in the cost model.
1031  void collectValuesToIgnore();
1032
1033  /// \returns The smallest bitwidth each instruction can be represented with.
1034  /// The vector equivalents of these instructions should be truncated to this
1035  /// type.
1036  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1037    return MinBWs;
1038  }
1039
1040  /// \returns True if it is more profitable to scalarize instruction \p I for
1041  /// vectorization factor \p VF.
1042  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1043    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1044
1045    // Cost model is not run in the VPlan-native path - return conservative
1046    // result until this changes.
1047    if (EnableVPlanNativePath)
1048      return false;
1049
1050    auto Scalars = InstsToScalarize.find(VF);
1051    assert(Scalars != InstsToScalarize.end() &&
1052           "VF not yet analyzed for scalarization profitability");
1053    return Scalars->second.find(I) != Scalars->second.end();
1054  }
1055
1056  /// Returns true if \p I is known to be uniform after vectorization.
1057  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1058    if (VF == 1)
1059      return true;
1060
1061    // Cost model is not run in the VPlan-native path - return conservative
1062    // result until this changes.
1063    if (EnableVPlanNativePath)
1064      return false;
1065
1066    auto UniformsPerVF = Uniforms.find(VF);
1067    assert(UniformsPerVF != Uniforms.end() &&
1068           "VF not yet analyzed for uniformity");
1069    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1070  }
1071
1072  /// Returns true if \p I is known to be scalar after vectorization.
1073  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1074    if (VF == 1)
1075      return true;
1076
1077    // Cost model is not run in the VPlan-native path - return conservative
1078    // result until this changes.
1079    if (EnableVPlanNativePath)
1080      return false;
1081
1082    auto ScalarsPerVF = Scalars.find(VF);
1083    assert(ScalarsPerVF != Scalars.end() &&
1084           "Scalar values are not calculated for VF");
1085    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1086  }
1087
1088  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1089  /// for vectorization factor \p VF.
1090  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1091    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1092           !isProfitableToScalarize(I, VF) &&
1093           !isScalarAfterVectorization(I, VF);
1094  }
1095
1096  /// Decision that was taken during cost calculation for memory instruction.
1097  enum InstWidening {
1098    CM_Unknown,
1099    CM_Widen,         // For consecutive accesses with stride +1.
1100    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1101    CM_Interleave,
1102    CM_GatherScatter,
1103    CM_Scalarize
1104  };
1105
1106  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107  /// instruction \p I and vector width \p VF.
1108  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1109                           unsigned Cost) {
1110    assert(VF >= 2 && "Expected VF >=2");
1111    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1112  }
1113
1114  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1115  /// interleaving group \p Grp and vector width \p VF.
1116  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1117                           InstWidening W, unsigned Cost) {
1118    assert(VF >= 2 && "Expected VF >=2");
1119    /// Broadcast this decicion to all instructions inside the group.
1120    /// But the cost will be assigned to one instruction only.
1121    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1122      if (auto *I = Grp->getMember(i)) {
1123        if (Grp->getInsertPos() == I)
1124          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1125        else
1126          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1127      }
1128    }
1129  }
1130
1131  /// Return the cost model decision for the given instruction \p I and vector
1132  /// width \p VF. Return CM_Unknown if this instruction did not pass
1133  /// through the cost modeling.
1134  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1135    assert(VF >= 2 && "Expected VF >=2");
1136
1137    // Cost model is not run in the VPlan-native path - return conservative
1138    // result until this changes.
1139    if (EnableVPlanNativePath)
1140      return CM_GatherScatter;
1141
1142    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1143    auto Itr = WideningDecisions.find(InstOnVF);
1144    if (Itr == WideningDecisions.end())
1145      return CM_Unknown;
1146    return Itr->second.first;
1147  }
1148
1149  /// Return the vectorization cost for the given instruction \p I and vector
1150  /// width \p VF.
1151  unsigned getWideningCost(Instruction *I, unsigned VF) {
1152    assert(VF >= 2 && "Expected VF >=2");
1153    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1154    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1155           "The cost is not calculated");
1156    return WideningDecisions[InstOnVF].second;
1157  }
1158
1159  /// Return True if instruction \p I is an optimizable truncate whose operand
1160  /// is an induction variable. Such a truncate will be removed by adding a new
1161  /// induction variable with the destination type.
1162  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1163    // If the instruction is not a truncate, return false.
1164    auto *Trunc = dyn_cast<TruncInst>(I);
1165    if (!Trunc)
1166      return false;
1167
1168    // Get the source and destination types of the truncate.
1169    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1170    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1171
1172    // If the truncate is free for the given types, return false. Replacing a
1173    // free truncate with an induction variable would add an induction variable
1174    // update instruction to each iteration of the loop. We exclude from this
1175    // check the primary induction variable since it will need an update
1176    // instruction regardless.
1177    Value *Op = Trunc->getOperand(0);
1178    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1179      return false;
1180
1181    // If the truncated value is not an induction variable, return false.
1182    return Legal->isInductionPhi(Op);
1183  }
1184
1185  /// Collects the instructions to scalarize for each predicated instruction in
1186  /// the loop.
1187  void collectInstsToScalarize(unsigned VF);
1188
1189  /// Collect Uniform and Scalar values for the given \p VF.
1190  /// The sets depend on CM decision for Load/Store instructions
1191  /// that may be vectorized as interleave, gather-scatter or scalarized.
1192  void collectUniformsAndScalars(unsigned VF) {
1193    // Do the analysis once.
1194    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1195      return;
1196    setCostBasedWideningDecision(VF);
1197    collectLoopUniforms(VF);
1198    collectLoopScalars(VF);
1199  }
1200
1201  /// Returns true if the target machine supports masked store operation
1202  /// for the given \p DataType and kind of access to \p Ptr.
1203  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1204    return Legal->isConsecutivePtr(Ptr) &&
1205           TTI.isLegalMaskedStore(DataType, Alignment);
1206  }
1207
1208  /// Returns true if the target machine supports masked load operation
1209  /// for the given \p DataType and kind of access to \p Ptr.
1210  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1211    return Legal->isConsecutivePtr(Ptr) &&
1212           TTI.isLegalMaskedLoad(DataType, Alignment);
1213  }
1214
1215  /// Returns true if the target machine supports masked scatter operation
1216  /// for the given \p DataType.
1217  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1218    return TTI.isLegalMaskedScatter(DataType, Alignment);
1219  }
1220
1221  /// Returns true if the target machine supports masked gather operation
1222  /// for the given \p DataType.
1223  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1224    return TTI.isLegalMaskedGather(DataType, Alignment);
1225  }
1226
1227  /// Returns true if the target machine can represent \p V as a masked gather
1228  /// or scatter operation.
1229  bool isLegalGatherOrScatter(Value *V) {
1230    bool LI = isa<LoadInst>(V);
1231    bool SI = isa<StoreInst>(V);
1232    if (!LI && !SI)
1233      return false;
1234    auto *Ty = getMemInstValueType(V);
1235    MaybeAlign Align = getLoadStoreAlignment(V);
1236    return (LI && isLegalMaskedGather(Ty, Align)) ||
1237           (SI && isLegalMaskedScatter(Ty, Align));
1238  }
1239
1240  /// Returns true if \p I is an instruction that will be scalarized with
1241  /// predication. Such instructions include conditional stores and
1242  /// instructions that may divide by zero.
1243  /// If a non-zero VF has been calculated, we check if I will be scalarized
1244  /// predication for that VF.
1245  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1246
1247  // Returns true if \p I is an instruction that will be predicated either
1248  // through scalar predication or masked load/store or masked gather/scatter.
1249  // Superset of instructions that return true for isScalarWithPredication.
1250  bool isPredicatedInst(Instruction *I) {
1251    if (!blockNeedsPredication(I->getParent()))
1252      return false;
1253    // Loads and stores that need some form of masked operation are predicated
1254    // instructions.
1255    if (isa<LoadInst>(I) || isa<StoreInst>(I))
1256      return Legal->isMaskRequired(I);
1257    return isScalarWithPredication(I);
1258  }
1259
1260  /// Returns true if \p I is a memory instruction with consecutive memory
1261  /// access that can be widened.
1262  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1263
1264  /// Returns true if \p I is a memory instruction in an interleaved-group
1265  /// of memory accesses that can be vectorized with wide vector loads/stores
1266  /// and shuffles.
1267  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1268
1269  /// Check if \p Instr belongs to any interleaved access group.
1270  bool isAccessInterleaved(Instruction *Instr) {
1271    return InterleaveInfo.isInterleaved(Instr);
1272  }
1273
1274  /// Get the interleaved access group that \p Instr belongs to.
1275  const InterleaveGroup<Instruction> *
1276  getInterleavedAccessGroup(Instruction *Instr) {
1277    return InterleaveInfo.getInterleaveGroup(Instr);
1278  }
1279
1280  /// Returns true if an interleaved group requires a scalar iteration
1281  /// to handle accesses with gaps, and there is nothing preventing us from
1282  /// creating a scalar epilogue.
1283  bool requiresScalarEpilogue() const {
1284    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1285  }
1286
1287  /// Returns true if a scalar epilogue is not allowed due to optsize or a
1288  /// loop hint annotation.
1289  bool isScalarEpilogueAllowed() const {
1290    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1291  }
1292
1293  /// Returns true if all loop blocks should be masked to fold tail loop.
1294  bool foldTailByMasking() const { return FoldTailByMasking; }
1295
1296  bool blockNeedsPredication(BasicBlock *BB) {
1297    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1298  }
1299
1300  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1301  /// with factor VF.  Return the cost of the instruction, including
1302  /// scalarization overhead if it's needed.
1303  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1304
1305  /// Estimate cost of a call instruction CI if it were vectorized with factor
1306  /// VF. Return the cost of the instruction, including scalarization overhead
1307  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1308  /// scalarized -
1309  /// i.e. either vector version isn't available, or is too expensive.
1310  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1311
1312private:
1313  unsigned NumPredStores = 0;
1314
1315  /// \return An upper bound for the vectorization factor, larger than zero.
1316  /// One is returned if vectorization should best be avoided due to cost.
1317  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1318
1319  /// The vectorization cost is a combination of the cost itself and a boolean
1320  /// indicating whether any of the contributing operations will actually
1321  /// operate on
1322  /// vector values after type legalization in the backend. If this latter value
1323  /// is
1324  /// false, then all operations will be scalarized (i.e. no vectorization has
1325  /// actually taken place).
1326  using VectorizationCostTy = std::pair<unsigned, bool>;
1327
1328  /// Returns the expected execution cost. The unit of the cost does
1329  /// not matter because we use the 'cost' units to compare different
1330  /// vector widths. The cost that is returned is *not* normalized by
1331  /// the factor width.
1332  VectorizationCostTy expectedCost(unsigned VF);
1333
1334  /// Returns the execution time cost of an instruction for a given vector
1335  /// width. Vector width of one means scalar.
1336  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1337
1338  /// The cost-computation logic from getInstructionCost which provides
1339  /// the vector type as an output parameter.
1340  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1341
1342  /// Calculate vectorization cost of memory instruction \p I.
1343  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1344
1345  /// The cost computation for scalarized memory instruction.
1346  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1347
1348  /// The cost computation for interleaving group of memory instructions.
1349  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1350
1351  /// The cost computation for Gather/Scatter instruction.
1352  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1353
1354  /// The cost computation for widening instruction \p I with consecutive
1355  /// memory access.
1356  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1357
1358  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1359  /// Load: scalar load + broadcast.
1360  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1361  /// element)
1362  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1363
1364  /// Estimate the overhead of scalarizing an instruction. This is a
1365  /// convenience wrapper for the type-based getScalarizationOverhead API.
1366  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1367
1368  /// Returns whether the instruction is a load or store and will be a emitted
1369  /// as a vector operation.
1370  bool isConsecutiveLoadOrStore(Instruction *I);
1371
1372  /// Returns true if an artificially high cost for emulated masked memrefs
1373  /// should be used.
1374  bool useEmulatedMaskMemRefHack(Instruction *I);
1375
1376  /// Map of scalar integer values to the smallest bitwidth they can be legally
1377  /// represented as. The vector equivalents of these values should be truncated
1378  /// to this type.
1379  MapVector<Instruction *, uint64_t> MinBWs;
1380
1381  /// A type representing the costs for instructions if they were to be
1382  /// scalarized rather than vectorized. The entries are Instruction-Cost
1383  /// pairs.
1384  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1385
1386  /// A set containing all BasicBlocks that are known to present after
1387  /// vectorization as a predicated block.
1388  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1389
1390  /// Records whether it is allowed to have the original scalar loop execute at
1391  /// least once. This may be needed as a fallback loop in case runtime
1392  /// aliasing/dependence checks fail, or to handle the tail/remainder
1393  /// iterations when the trip count is unknown or doesn't divide by the VF,
1394  /// or as a peel-loop to handle gaps in interleave-groups.
1395  /// Under optsize and when the trip count is very small we don't allow any
1396  /// iterations to execute in the scalar loop.
1397  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1398
1399  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1400  bool FoldTailByMasking = false;
1401
1402  /// A map holding scalar costs for different vectorization factors. The
1403  /// presence of a cost for an instruction in the mapping indicates that the
1404  /// instruction will be scalarized when vectorizing with the associated
1405  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1406  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1407
1408  /// Holds the instructions known to be uniform after vectorization.
1409  /// The data is collected per VF.
1410  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1411
1412  /// Holds the instructions known to be scalar after vectorization.
1413  /// The data is collected per VF.
1414  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1415
1416  /// Holds the instructions (address computations) that are forced to be
1417  /// scalarized.
1418  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1419
1420  /// Returns the expected difference in cost from scalarizing the expression
1421  /// feeding a predicated instruction \p PredInst. The instructions to
1422  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1423  /// non-negative return value implies the expression will be scalarized.
1424  /// Currently, only single-use chains are considered for scalarization.
1425  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1426                              unsigned VF);
1427
1428  /// Collect the instructions that are uniform after vectorization. An
1429  /// instruction is uniform if we represent it with a single scalar value in
1430  /// the vectorized loop corresponding to each vector iteration. Examples of
1431  /// uniform instructions include pointer operands of consecutive or
1432  /// interleaved memory accesses. Note that although uniformity implies an
1433  /// instruction will be scalar, the reverse is not true. In general, a
1434  /// scalarized instruction will be represented by VF scalar values in the
1435  /// vectorized loop, each corresponding to an iteration of the original
1436  /// scalar loop.
1437  void collectLoopUniforms(unsigned VF);
1438
1439  /// Collect the instructions that are scalar after vectorization. An
1440  /// instruction is scalar if it is known to be uniform or will be scalarized
1441  /// during vectorization. Non-uniform scalarized instructions will be
1442  /// represented by VF values in the vectorized loop, each corresponding to an
1443  /// iteration of the original scalar loop.
1444  void collectLoopScalars(unsigned VF);
1445
1446  /// Keeps cost model vectorization decision and cost for instructions.
1447  /// Right now it is used for memory instructions only.
1448  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1449                                std::pair<InstWidening, unsigned>>;
1450
1451  DecisionList WideningDecisions;
1452
1453  /// Returns true if \p V is expected to be vectorized and it needs to be
1454  /// extracted.
1455  bool needsExtract(Value *V, unsigned VF) const {
1456    Instruction *I = dyn_cast<Instruction>(V);
1457    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1458      return false;
1459
1460    // Assume we can vectorize V (and hence we need extraction) if the
1461    // scalars are not computed yet. This can happen, because it is called
1462    // via getScalarizationOverhead from setCostBasedWideningDecision, before
1463    // the scalars are collected. That should be a safe assumption in most
1464    // cases, because we check if the operands have vectorizable types
1465    // beforehand in LoopVectorizationLegality.
1466    return Scalars.find(VF) == Scalars.end() ||
1467           !isScalarAfterVectorization(I, VF);
1468  };
1469
1470  /// Returns a range containing only operands needing to be extracted.
1471  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1472                                                   unsigned VF) {
1473    return SmallVector<Value *, 4>(make_filter_range(
1474        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1475  }
1476
1477public:
1478  /// The loop that we evaluate.
1479  Loop *TheLoop;
1480
1481  /// Predicated scalar evolution analysis.
1482  PredicatedScalarEvolution &PSE;
1483
1484  /// Loop Info analysis.
1485  LoopInfo *LI;
1486
1487  /// Vectorization legality.
1488  LoopVectorizationLegality *Legal;
1489
1490  /// Vector target information.
1491  const TargetTransformInfo &TTI;
1492
1493  /// Target Library Info.
1494  const TargetLibraryInfo *TLI;
1495
1496  /// Demanded bits analysis.
1497  DemandedBits *DB;
1498
1499  /// Assumption cache.
1500  AssumptionCache *AC;
1501
1502  /// Interface to emit optimization remarks.
1503  OptimizationRemarkEmitter *ORE;
1504
1505  const Function *TheFunction;
1506
1507  /// Loop Vectorize Hint.
1508  const LoopVectorizeHints *Hints;
1509
1510  /// The interleave access information contains groups of interleaved accesses
1511  /// with the same stride and close to each other.
1512  InterleavedAccessInfo &InterleaveInfo;
1513
1514  /// Values to ignore in the cost model.
1515  SmallPtrSet<const Value *, 16> ValuesToIgnore;
1516
1517  /// Values to ignore in the cost model when VF > 1.
1518  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1519};
1520
1521} // end namespace llvm
1522
1523// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1524// vectorization. The loop needs to be annotated with #pragma omp simd
1525// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1526// vector length information is not provided, vectorization is not considered
1527// explicit. Interleave hints are not allowed either. These limitations will be
1528// relaxed in the future.
1529// Please, note that we are currently forced to abuse the pragma 'clang
1530// vectorize' semantics. This pragma provides *auto-vectorization hints*
1531// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1532// provides *explicit vectorization hints* (LV can bypass legal checks and
1533// assume that vectorization is legal). However, both hints are implemented
1534// using the same metadata (llvm.loop.vectorize, processed by
1535// LoopVectorizeHints). This will be fixed in the future when the native IR
1536// representation for pragma 'omp simd' is introduced.
1537static bool isExplicitVecOuterLoop(Loop *OuterLp,
1538                                   OptimizationRemarkEmitter *ORE) {
1539  assert(!OuterLp->empty() && "This is not an outer loop");
1540  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1541
1542  // Only outer loops with an explicit vectorization hint are supported.
1543  // Unannotated outer loops are ignored.
1544  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1545    return false;
1546
1547  Function *Fn = OuterLp->getHeader()->getParent();
1548  if (!Hints.allowVectorization(Fn, OuterLp,
1549                                true /*VectorizeOnlyWhenForced*/)) {
1550    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1551    return false;
1552  }
1553
1554  if (Hints.getInterleave() > 1) {
1555    // TODO: Interleave support is future work.
1556    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1557                         "outer loops.\n");
1558    Hints.emitRemarkWithHints();
1559    return false;
1560  }
1561
1562  return true;
1563}
1564
1565static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1566                                  OptimizationRemarkEmitter *ORE,
1567                                  SmallVectorImpl<Loop *> &V) {
1568  // Collect inner loops and outer loops without irreducible control flow. For
1569  // now, only collect outer loops that have explicit vectorization hints. If we
1570  // are stress testing the VPlan H-CFG construction, we collect the outermost
1571  // loop of every loop nest.
1572  if (L.empty() || VPlanBuildStressTest ||
1573      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1574    LoopBlocksRPO RPOT(&L);
1575    RPOT.perform(LI);
1576    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1577      V.push_back(&L);
1578      // TODO: Collect inner loops inside marked outer loops in case
1579      // vectorization fails for the outer loop. Do not invoke
1580      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1581      // already known to be reducible. We can use an inherited attribute for
1582      // that.
1583      return;
1584    }
1585  }
1586  for (Loop *InnerL : L)
1587    collectSupportedLoops(*InnerL, LI, ORE, V);
1588}
1589
1590namespace {
1591
1592/// The LoopVectorize Pass.
1593struct LoopVectorize : public FunctionPass {
1594  /// Pass identification, replacement for typeid
1595  static char ID;
1596
1597  LoopVectorizePass Impl;
1598
1599  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1600                         bool VectorizeOnlyWhenForced = false)
1601      : FunctionPass(ID) {
1602    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1603    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1604    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1605  }
1606
1607  bool runOnFunction(Function &F) override {
1608    if (skipFunction(F))
1609      return false;
1610
1611    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1612    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1613    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1614    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1615    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1616    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1617    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1618    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1619    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1620    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1621    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1622    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1623    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1624
1625    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1626        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1627
1628    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1629                        GetLAA, *ORE, PSI);
1630  }
1631
1632  void getAnalysisUsage(AnalysisUsage &AU) const override {
1633    AU.addRequired<AssumptionCacheTracker>();
1634    AU.addRequired<BlockFrequencyInfoWrapperPass>();
1635    AU.addRequired<DominatorTreeWrapperPass>();
1636    AU.addRequired<LoopInfoWrapperPass>();
1637    AU.addRequired<ScalarEvolutionWrapperPass>();
1638    AU.addRequired<TargetTransformInfoWrapperPass>();
1639    AU.addRequired<AAResultsWrapperPass>();
1640    AU.addRequired<LoopAccessLegacyAnalysis>();
1641    AU.addRequired<DemandedBitsWrapperPass>();
1642    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1643
1644    // We currently do not preserve loopinfo/dominator analyses with outer loop
1645    // vectorization. Until this is addressed, mark these analyses as preserved
1646    // only for non-VPlan-native path.
1647    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1648    if (!EnableVPlanNativePath) {
1649      AU.addPreserved<LoopInfoWrapperPass>();
1650      AU.addPreserved<DominatorTreeWrapperPass>();
1651    }
1652
1653    AU.addPreserved<BasicAAWrapperPass>();
1654    AU.addPreserved<GlobalsAAWrapperPass>();
1655    AU.addRequired<ProfileSummaryInfoWrapperPass>();
1656  }
1657};
1658
1659} // end anonymous namespace
1660
1661//===----------------------------------------------------------------------===//
1662// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1663// LoopVectorizationCostModel and LoopVectorizationPlanner.
1664//===----------------------------------------------------------------------===//
1665
1666Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1667  // We need to place the broadcast of invariant variables outside the loop,
1668  // but only if it's proven safe to do so. Else, broadcast will be inside
1669  // vector loop body.
1670  Instruction *Instr = dyn_cast<Instruction>(V);
1671  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1672                     (!Instr ||
1673                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1674  // Place the code for broadcasting invariant variables in the new preheader.
1675  IRBuilder<>::InsertPointGuard Guard(Builder);
1676  if (SafeToHoist)
1677    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1678
1679  // Broadcast the scalar into all locations in the vector.
1680  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1681
1682  return Shuf;
1683}
1684
1685void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1686    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1687  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1688         "Expected either an induction phi-node or a truncate of it!");
1689  Value *Start = II.getStartValue();
1690
1691  // Construct the initial value of the vector IV in the vector loop preheader
1692  auto CurrIP = Builder.saveIP();
1693  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1694  if (isa<TruncInst>(EntryVal)) {
1695    assert(Start->getType()->isIntegerTy() &&
1696           "Truncation requires an integer type");
1697    auto *TruncType = cast<IntegerType>(EntryVal->getType());
1698    Step = Builder.CreateTrunc(Step, TruncType);
1699    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1700  }
1701  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1702  Value *SteppedStart =
1703      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1704
1705  // We create vector phi nodes for both integer and floating-point induction
1706  // variables. Here, we determine the kind of arithmetic we will perform.
1707  Instruction::BinaryOps AddOp;
1708  Instruction::BinaryOps MulOp;
1709  if (Step->getType()->isIntegerTy()) {
1710    AddOp = Instruction::Add;
1711    MulOp = Instruction::Mul;
1712  } else {
1713    AddOp = II.getInductionOpcode();
1714    MulOp = Instruction::FMul;
1715  }
1716
1717  // Multiply the vectorization factor by the step using integer or
1718  // floating-point arithmetic as appropriate.
1719  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1720  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1721
1722  // Create a vector splat to use in the induction update.
1723  //
1724  // FIXME: If the step is non-constant, we create the vector splat with
1725  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1726  //        handle a constant vector splat.
1727  Value *SplatVF = isa<Constant>(Mul)
1728                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1729                       : Builder.CreateVectorSplat(VF, Mul);
1730  Builder.restoreIP(CurrIP);
1731
1732  // We may need to add the step a number of times, depending on the unroll
1733  // factor. The last of those goes into the PHI.
1734  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1735                                    &*LoopVectorBody->getFirstInsertionPt());
1736  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1737  Instruction *LastInduction = VecInd;
1738  for (unsigned Part = 0; Part < UF; ++Part) {
1739    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1740
1741    if (isa<TruncInst>(EntryVal))
1742      addMetadata(LastInduction, EntryVal);
1743    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1744
1745    LastInduction = cast<Instruction>(addFastMathFlag(
1746        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1747    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1748  }
1749
1750  // Move the last step to the end of the latch block. This ensures consistent
1751  // placement of all induction updates.
1752  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1753  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1754  auto *ICmp = cast<Instruction>(Br->getCondition());
1755  LastInduction->moveBefore(ICmp);
1756  LastInduction->setName("vec.ind.next");
1757
1758  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1759  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1760}
1761
1762bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1763  return Cost->isScalarAfterVectorization(I, VF) ||
1764         Cost->isProfitableToScalarize(I, VF);
1765}
1766
1767bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1768  if (shouldScalarizeInstruction(IV))
1769    return true;
1770  auto isScalarInst = [&](User *U) -> bool {
1771    auto *I = cast<Instruction>(U);
1772    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1773  };
1774  return llvm::any_of(IV->users(), isScalarInst);
1775}
1776
1777void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1778    const InductionDescriptor &ID, const Instruction *EntryVal,
1779    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1780  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1781         "Expected either an induction phi-node or a truncate of it!");
1782
1783  // This induction variable is not the phi from the original loop but the
1784  // newly-created IV based on the proof that casted Phi is equal to the
1785  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1786  // re-uses the same InductionDescriptor that original IV uses but we don't
1787  // have to do any recording in this case - that is done when original IV is
1788  // processed.
1789  if (isa<TruncInst>(EntryVal))
1790    return;
1791
1792  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1793  if (Casts.empty())
1794    return;
1795  // Only the first Cast instruction in the Casts vector is of interest.
1796  // The rest of the Casts (if exist) have no uses outside the
1797  // induction update chain itself.
1798  Instruction *CastInst = *Casts.begin();
1799  if (Lane < UINT_MAX)
1800    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1801  else
1802    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1803}
1804
1805void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1806  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1807         "Primary induction variable must have an integer type");
1808
1809  auto II = Legal->getInductionVars()->find(IV);
1810  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1811
1812  auto ID = II->second;
1813  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1814
1815  // The scalar value to broadcast. This will be derived from the canonical
1816  // induction variable.
1817  Value *ScalarIV = nullptr;
1818
1819  // The value from the original loop to which we are mapping the new induction
1820  // variable.
1821  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1822
1823  // True if we have vectorized the induction variable.
1824  auto VectorizedIV = false;
1825
1826  // Determine if we want a scalar version of the induction variable. This is
1827  // true if the induction variable itself is not widened, or if it has at
1828  // least one user in the loop that is not widened.
1829  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1830
1831  // Generate code for the induction step. Note that induction steps are
1832  // required to be loop-invariant
1833  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1834         "Induction step should be loop invariant");
1835  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1836  Value *Step = nullptr;
1837  if (PSE.getSE()->isSCEVable(IV->getType())) {
1838    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1839    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1840                             LoopVectorPreHeader->getTerminator());
1841  } else {
1842    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1843  }
1844
1845  // Try to create a new independent vector induction variable. If we can't
1846  // create the phi node, we will splat the scalar induction variable in each
1847  // loop iteration.
1848  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1849    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1850    VectorizedIV = true;
1851  }
1852
1853  // If we haven't yet vectorized the induction variable, or if we will create
1854  // a scalar one, we need to define the scalar induction variable and step
1855  // values. If we were given a truncation type, truncate the canonical
1856  // induction variable and step. Otherwise, derive these values from the
1857  // induction descriptor.
1858  if (!VectorizedIV || NeedsScalarIV) {
1859    ScalarIV = Induction;
1860    if (IV != OldInduction) {
1861      ScalarIV = IV->getType()->isIntegerTy()
1862                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1863                     : Builder.CreateCast(Instruction::SIToFP, Induction,
1864                                          IV->getType());
1865      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1866      ScalarIV->setName("offset.idx");
1867    }
1868    if (Trunc) {
1869      auto *TruncType = cast<IntegerType>(Trunc->getType());
1870      assert(Step->getType()->isIntegerTy() &&
1871             "Truncation requires an integer step");
1872      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1873      Step = Builder.CreateTrunc(Step, TruncType);
1874    }
1875  }
1876
1877  // If we haven't yet vectorized the induction variable, splat the scalar
1878  // induction variable, and build the necessary step vectors.
1879  // TODO: Don't do it unless the vectorized IV is really required.
1880  if (!VectorizedIV) {
1881    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1882    for (unsigned Part = 0; Part < UF; ++Part) {
1883      Value *EntryPart =
1884          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1885      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1886      if (Trunc)
1887        addMetadata(EntryPart, Trunc);
1888      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1889    }
1890  }
1891
1892  // If an induction variable is only used for counting loop iterations or
1893  // calculating addresses, it doesn't need to be widened. Create scalar steps
1894  // that can be used by instructions we will later scalarize. Note that the
1895  // addition of the scalar steps will not increase the number of instructions
1896  // in the loop in the common case prior to InstCombine. We will be trading
1897  // one vector extract for each scalar step.
1898  if (NeedsScalarIV)
1899    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1900}
1901
1902Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1903                                          Instruction::BinaryOps BinOp) {
1904  // Create and check the types.
1905  assert(Val->getType()->isVectorTy() && "Must be a vector");
1906  int VLen = Val->getType()->getVectorNumElements();
1907
1908  Type *STy = Val->getType()->getScalarType();
1909  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1910         "Induction Step must be an integer or FP");
1911  assert(Step->getType() == STy && "Step has wrong type");
1912
1913  SmallVector<Constant *, 8> Indices;
1914
1915  if (STy->isIntegerTy()) {
1916    // Create a vector of consecutive numbers from zero to VF.
1917    for (int i = 0; i < VLen; ++i)
1918      Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1919
1920    // Add the consecutive indices to the vector value.
1921    Constant *Cv = ConstantVector::get(Indices);
1922    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1923    Step = Builder.CreateVectorSplat(VLen, Step);
1924    assert(Step->getType() == Val->getType() && "Invalid step vec");
1925    // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1926    // which can be found from the original scalar operations.
1927    Step = Builder.CreateMul(Cv, Step);
1928    return Builder.CreateAdd(Val, Step, "induction");
1929  }
1930
1931  // Floating point induction.
1932  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1933         "Binary Opcode should be specified for FP induction");
1934  // Create a vector of consecutive numbers from zero to VF.
1935  for (int i = 0; i < VLen; ++i)
1936    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1937
1938  // Add the consecutive indices to the vector value.
1939  Constant *Cv = ConstantVector::get(Indices);
1940
1941  Step = Builder.CreateVectorSplat(VLen, Step);
1942
1943  // Floating point operations had to be 'fast' to enable the induction.
1944  FastMathFlags Flags;
1945  Flags.setFast();
1946
1947  Value *MulOp = Builder.CreateFMul(Cv, Step);
1948  if (isa<Instruction>(MulOp))
1949    // Have to check, MulOp may be a constant
1950    cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1951
1952  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1953  if (isa<Instruction>(BOp))
1954    cast<Instruction>(BOp)->setFastMathFlags(Flags);
1955  return BOp;
1956}
1957
1958void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1959                                           Instruction *EntryVal,
1960                                           const InductionDescriptor &ID) {
1961  // We shouldn't have to build scalar steps if we aren't vectorizing.
1962  assert(VF > 1 && "VF should be greater than one");
1963
1964  // Get the value type and ensure it and the step have the same integer type.
1965  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1966  assert(ScalarIVTy == Step->getType() &&
1967         "Val and Step should have the same type");
1968
1969  // We build scalar steps for both integer and floating-point induction
1970  // variables. Here, we determine the kind of arithmetic we will perform.
1971  Instruction::BinaryOps AddOp;
1972  Instruction::BinaryOps MulOp;
1973  if (ScalarIVTy->isIntegerTy()) {
1974    AddOp = Instruction::Add;
1975    MulOp = Instruction::Mul;
1976  } else {
1977    AddOp = ID.getInductionOpcode();
1978    MulOp = Instruction::FMul;
1979  }
1980
1981  // Determine the number of scalars we need to generate for each unroll
1982  // iteration. If EntryVal is uniform, we only need to generate the first
1983  // lane. Otherwise, we generate all VF values.
1984  unsigned Lanes =
1985      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1986                                                                         : VF;
1987  // Compute the scalar steps and save the results in VectorLoopValueMap.
1988  for (unsigned Part = 0; Part < UF; ++Part) {
1989    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1990      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1991      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1992      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1993      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1994      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1995    }
1996  }
1997}
1998
1999Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2000  assert(V != Induction && "The new induction variable should not be used.");
2001  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2002  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2003
2004  // If we have a stride that is replaced by one, do it here. Defer this for
2005  // the VPlan-native path until we start running Legal checks in that path.
2006  if (!EnableVPlanNativePath && Legal->hasStride(V))
2007    V = ConstantInt::get(V->getType(), 1);
2008
2009  // If we have a vector mapped to this value, return it.
2010  if (VectorLoopValueMap.hasVectorValue(V, Part))
2011    return VectorLoopValueMap.getVectorValue(V, Part);
2012
2013  // If the value has not been vectorized, check if it has been scalarized
2014  // instead. If it has been scalarized, and we actually need the value in
2015  // vector form, we will construct the vector values on demand.
2016  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2017    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2018
2019    // If we've scalarized a value, that value should be an instruction.
2020    auto *I = cast<Instruction>(V);
2021
2022    // If we aren't vectorizing, we can just copy the scalar map values over to
2023    // the vector map.
2024    if (VF == 1) {
2025      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2026      return ScalarValue;
2027    }
2028
2029    // Get the last scalar instruction we generated for V and Part. If the value
2030    // is known to be uniform after vectorization, this corresponds to lane zero
2031    // of the Part unroll iteration. Otherwise, the last instruction is the one
2032    // we created for the last vector lane of the Part unroll iteration.
2033    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2034    auto *LastInst = cast<Instruction>(
2035        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2036
2037    // Set the insert point after the last scalarized instruction. This ensures
2038    // the insertelement sequence will directly follow the scalar definitions.
2039    auto OldIP = Builder.saveIP();
2040    auto NewIP = std::next(BasicBlock::iterator(LastInst));
2041    Builder.SetInsertPoint(&*NewIP);
2042
2043    // However, if we are vectorizing, we need to construct the vector values.
2044    // If the value is known to be uniform after vectorization, we can just
2045    // broadcast the scalar value corresponding to lane zero for each unroll
2046    // iteration. Otherwise, we construct the vector values using insertelement
2047    // instructions. Since the resulting vectors are stored in
2048    // VectorLoopValueMap, we will only generate the insertelements once.
2049    Value *VectorValue = nullptr;
2050    if (Cost->isUniformAfterVectorization(I, VF)) {
2051      VectorValue = getBroadcastInstrs(ScalarValue);
2052      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2053    } else {
2054      // Initialize packing with insertelements to start from undef.
2055      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2056      VectorLoopValueMap.setVectorValue(V, Part, Undef);
2057      for (unsigned Lane = 0; Lane < VF; ++Lane)
2058        packScalarIntoVectorValue(V, {Part, Lane});
2059      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2060    }
2061    Builder.restoreIP(OldIP);
2062    return VectorValue;
2063  }
2064
2065  // If this scalar is unknown, assume that it is a constant or that it is
2066  // loop invariant. Broadcast V and save the value for future uses.
2067  Value *B = getBroadcastInstrs(V);
2068  VectorLoopValueMap.setVectorValue(V, Part, B);
2069  return B;
2070}
2071
2072Value *
2073InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2074                                            const VPIteration &Instance) {
2075  // If the value is not an instruction contained in the loop, it should
2076  // already be scalar.
2077  if (OrigLoop->isLoopInvariant(V))
2078    return V;
2079
2080  assert(Instance.Lane > 0
2081             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2082             : true && "Uniform values only have lane zero");
2083
2084  // If the value from the original loop has not been vectorized, it is
2085  // represented by UF x VF scalar values in the new loop. Return the requested
2086  // scalar value.
2087  if (VectorLoopValueMap.hasScalarValue(V, Instance))
2088    return VectorLoopValueMap.getScalarValue(V, Instance);
2089
2090  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2091  // for the given unroll part. If this entry is not a vector type (i.e., the
2092  // vectorization factor is one), there is no need to generate an
2093  // extractelement instruction.
2094  auto *U = getOrCreateVectorValue(V, Instance.Part);
2095  if (!U->getType()->isVectorTy()) {
2096    assert(VF == 1 && "Value not scalarized has non-vector type");
2097    return U;
2098  }
2099
2100  // Otherwise, the value from the original loop has been vectorized and is
2101  // represented by UF vector values. Extract and return the requested scalar
2102  // value from the appropriate vector lane.
2103  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2104}
2105
2106void InnerLoopVectorizer::packScalarIntoVectorValue(
2107    Value *V, const VPIteration &Instance) {
2108  assert(V != Induction && "The new induction variable should not be used.");
2109  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2110  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2111
2112  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2113  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2114  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2115                                            Builder.getInt32(Instance.Lane));
2116  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2117}
2118
2119Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2120  assert(Vec->getType()->isVectorTy() && "Invalid type");
2121  SmallVector<Constant *, 8> ShuffleMask;
2122  for (unsigned i = 0; i < VF; ++i)
2123    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2124
2125  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2126                                     ConstantVector::get(ShuffleMask),
2127                                     "reverse");
2128}
2129
2130// Return whether we allow using masked interleave-groups (for dealing with
2131// strided loads/stores that reside in predicated blocks, or for dealing
2132// with gaps).
2133static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2134  // If an override option has been passed in for interleaved accesses, use it.
2135  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2136    return EnableMaskedInterleavedMemAccesses;
2137
2138  return TTI.enableMaskedInterleavedAccessVectorization();
2139}
2140
2141// Try to vectorize the interleave group that \p Instr belongs to.
2142//
2143// E.g. Translate following interleaved load group (factor = 3):
2144//   for (i = 0; i < N; i+=3) {
2145//     R = Pic[i];             // Member of index 0
2146//     G = Pic[i+1];           // Member of index 1
2147//     B = Pic[i+2];           // Member of index 2
2148//     ... // do something to R, G, B
2149//   }
2150// To:
2151//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2152//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2153//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2154//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2155//
2156// Or translate following interleaved store group (factor = 3):
2157//   for (i = 0; i < N; i+=3) {
2158//     ... do something to R, G, B
2159//     Pic[i]   = R;           // Member of index 0
2160//     Pic[i+1] = G;           // Member of index 1
2161//     Pic[i+2] = B;           // Member of index 2
2162//   }
2163// To:
2164//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2165//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2166//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2167//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2168//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2169void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2170                                                   VPTransformState &State,
2171                                                   VPValue *Addr,
2172                                                   VPValue *BlockInMask) {
2173  const InterleaveGroup<Instruction> *Group =
2174      Cost->getInterleavedAccessGroup(Instr);
2175  assert(Group && "Fail to get an interleaved access group.");
2176
2177  // Skip if current instruction is not the insert position.
2178  if (Instr != Group->getInsertPos())
2179    return;
2180
2181  const DataLayout &DL = Instr->getModule()->getDataLayout();
2182
2183  // Prepare for the vector type of the interleaved load/store.
2184  Type *ScalarTy = getMemInstValueType(Instr);
2185  unsigned InterleaveFactor = Group->getFactor();
2186  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2187
2188  // Prepare for the new pointers.
2189  SmallVector<Value *, 2> AddrParts;
2190  unsigned Index = Group->getIndex(Instr);
2191
2192  // TODO: extend the masked interleaved-group support to reversed access.
2193  assert((!BlockInMask || !Group->isReverse()) &&
2194         "Reversed masked interleave-group not supported.");
2195
2196  // If the group is reverse, adjust the index to refer to the last vector lane
2197  // instead of the first. We adjust the index from the first vector lane,
2198  // rather than directly getting the pointer for lane VF - 1, because the
2199  // pointer operand of the interleaved access is supposed to be uniform. For
2200  // uniform instructions, we're only required to generate a value for the
2201  // first vector lane in each unroll iteration.
2202  if (Group->isReverse())
2203    Index += (VF - 1) * Group->getFactor();
2204
2205  for (unsigned Part = 0; Part < UF; Part++) {
2206    Value *AddrPart = State.get(Addr, {Part, 0});
2207    setDebugLocFromInst(Builder, AddrPart);
2208
2209    // Notice current instruction could be any index. Need to adjust the address
2210    // to the member of index 0.
2211    //
2212    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2213    //       b = A[i];       // Member of index 0
2214    // Current pointer is pointed to A[i+1], adjust it to A[i].
2215    //
2216    // E.g.  A[i+1] = a;     // Member of index 1
2217    //       A[i]   = b;     // Member of index 0
2218    //       A[i+2] = c;     // Member of index 2 (Current instruction)
2219    // Current pointer is pointed to A[i+2], adjust it to A[i].
2220
2221    bool InBounds = false;
2222    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2223      InBounds = gep->isInBounds();
2224    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2225    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2226
2227    // Cast to the vector pointer type.
2228    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2229    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2230    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2231  }
2232
2233  setDebugLocFromInst(Builder, Instr);
2234  Value *UndefVec = UndefValue::get(VecTy);
2235
2236  Value *MaskForGaps = nullptr;
2237  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2238    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2239    assert(MaskForGaps && "Mask for Gaps is required but it is null");
2240  }
2241
2242  // Vectorize the interleaved load group.
2243  if (isa<LoadInst>(Instr)) {
2244    // For each unroll part, create a wide load for the group.
2245    SmallVector<Value *, 2> NewLoads;
2246    for (unsigned Part = 0; Part < UF; Part++) {
2247      Instruction *NewLoad;
2248      if (BlockInMask || MaskForGaps) {
2249        assert(useMaskedInterleavedAccesses(*TTI) &&
2250               "masked interleaved groups are not allowed.");
2251        Value *GroupMask = MaskForGaps;
2252        if (BlockInMask) {
2253          Value *BlockInMaskPart = State.get(BlockInMask, Part);
2254          auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2255          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2256          Value *ShuffledMask = Builder.CreateShuffleVector(
2257              BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2258          GroupMask = MaskForGaps
2259                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2260                                                MaskForGaps)
2261                          : ShuffledMask;
2262        }
2263        NewLoad =
2264            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
2265                                     GroupMask, UndefVec, "wide.masked.vec");
2266      }
2267      else
2268        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2269                                            Group->getAlignment(), "wide.vec");
2270      Group->addMetadata(NewLoad);
2271      NewLoads.push_back(NewLoad);
2272    }
2273
2274    // For each member in the group, shuffle out the appropriate data from the
2275    // wide loads.
2276    for (unsigned I = 0; I < InterleaveFactor; ++I) {
2277      Instruction *Member = Group->getMember(I);
2278
2279      // Skip the gaps in the group.
2280      if (!Member)
2281        continue;
2282
2283      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2284      for (unsigned Part = 0; Part < UF; Part++) {
2285        Value *StridedVec = Builder.CreateShuffleVector(
2286            NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2287
2288        // If this member has different type, cast the result type.
2289        if (Member->getType() != ScalarTy) {
2290          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2291          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2292        }
2293
2294        if (Group->isReverse())
2295          StridedVec = reverseVector(StridedVec);
2296
2297        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2298      }
2299    }
2300    return;
2301  }
2302
2303  // The sub vector type for current instruction.
2304  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2305
2306  // Vectorize the interleaved store group.
2307  for (unsigned Part = 0; Part < UF; Part++) {
2308    // Collect the stored vector from each member.
2309    SmallVector<Value *, 4> StoredVecs;
2310    for (unsigned i = 0; i < InterleaveFactor; i++) {
2311      // Interleaved store group doesn't allow a gap, so each index has a member
2312      Instruction *Member = Group->getMember(i);
2313      assert(Member && "Fail to get a member from an interleaved store group");
2314
2315      Value *StoredVec = getOrCreateVectorValue(
2316          cast<StoreInst>(Member)->getValueOperand(), Part);
2317      if (Group->isReverse())
2318        StoredVec = reverseVector(StoredVec);
2319
2320      // If this member has different type, cast it to a unified type.
2321
2322      if (StoredVec->getType() != SubVT)
2323        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2324
2325      StoredVecs.push_back(StoredVec);
2326    }
2327
2328    // Concatenate all vectors into a wide vector.
2329    Value *WideVec = concatenateVectors(Builder, StoredVecs);
2330
2331    // Interleave the elements in the wide vector.
2332    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2333    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2334                                              "interleaved.vec");
2335
2336    Instruction *NewStoreInstr;
2337    if (BlockInMask) {
2338      Value *BlockInMaskPart = State.get(BlockInMask, Part);
2339      auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2340      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2341      Value *ShuffledMask = Builder.CreateShuffleVector(
2342          BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2343      NewStoreInstr = Builder.CreateMaskedStore(
2344          IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
2345    }
2346    else
2347      NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
2348                                                 Group->getAlignment());
2349
2350    Group->addMetadata(NewStoreInstr);
2351  }
2352}
2353
2354void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2355                                                     VPTransformState &State,
2356                                                     VPValue *Addr,
2357                                                     VPValue *BlockInMask) {
2358  // Attempt to issue a wide load.
2359  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2360  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2361
2362  assert((LI || SI) && "Invalid Load/Store instruction");
2363
2364  LoopVectorizationCostModel::InstWidening Decision =
2365      Cost->getWideningDecision(Instr, VF);
2366  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2367         "CM decision should be taken at this point");
2368  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2369    return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2370
2371  Type *ScalarDataTy = getMemInstValueType(Instr);
2372  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2373  // An alignment of 0 means target abi alignment. We need to use the scalar's
2374  // target abi alignment in such a case.
2375  const DataLayout &DL = Instr->getModule()->getDataLayout();
2376  const Align Alignment =
2377      DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2378
2379  // Determine if the pointer operand of the access is either consecutive or
2380  // reverse consecutive.
2381  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2382  bool ConsecutiveStride =
2383      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2384  bool CreateGatherScatter =
2385      (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2386
2387  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2388  // gather/scatter. Otherwise Decision should have been to Scalarize.
2389  assert((ConsecutiveStride || CreateGatherScatter) &&
2390         "The instruction should be scalarized");
2391  (void)ConsecutiveStride;
2392
2393  VectorParts BlockInMaskParts(UF);
2394  bool isMaskRequired = BlockInMask;
2395  if (isMaskRequired)
2396    for (unsigned Part = 0; Part < UF; ++Part)
2397      BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2398
2399  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2400    // Calculate the pointer for the specific unroll-part.
2401    GetElementPtrInst *PartPtr = nullptr;
2402
2403    bool InBounds = false;
2404    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2405      InBounds = gep->isInBounds();
2406
2407    if (Reverse) {
2408      // If the address is consecutive but reversed, then the
2409      // wide store needs to start at the last vector element.
2410      PartPtr = cast<GetElementPtrInst>(
2411          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2412      PartPtr->setIsInBounds(InBounds);
2413      PartPtr = cast<GetElementPtrInst>(
2414          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2415      PartPtr->setIsInBounds(InBounds);
2416      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2417        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2418    } else {
2419      PartPtr = cast<GetElementPtrInst>(
2420          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2421      PartPtr->setIsInBounds(InBounds);
2422    }
2423
2424    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2425    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2426  };
2427
2428  // Handle Stores:
2429  if (SI) {
2430    setDebugLocFromInst(Builder, SI);
2431
2432    for (unsigned Part = 0; Part < UF; ++Part) {
2433      Instruction *NewSI = nullptr;
2434      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2435      if (CreateGatherScatter) {
2436        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2437        Value *VectorGep = State.get(Addr, Part);
2438        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2439                                            Alignment.value(), MaskPart);
2440      } else {
2441        if (Reverse) {
2442          // If we store to reverse consecutive memory locations, then we need
2443          // to reverse the order of elements in the stored value.
2444          StoredVal = reverseVector(StoredVal);
2445          // We don't want to update the value in the map as it might be used in
2446          // another expression. So don't call resetVectorValue(StoredVal).
2447        }
2448        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2449        if (isMaskRequired)
2450          NewSI = Builder.CreateMaskedStore(
2451              StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
2452        else
2453          NewSI =
2454              Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2455      }
2456      addMetadata(NewSI, SI);
2457    }
2458    return;
2459  }
2460
2461  // Handle loads.
2462  assert(LI && "Must have a load instruction");
2463  setDebugLocFromInst(Builder, LI);
2464  for (unsigned Part = 0; Part < UF; ++Part) {
2465    Value *NewLI;
2466    if (CreateGatherScatter) {
2467      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2468      Value *VectorGep = State.get(Addr, Part);
2469      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2470                                         nullptr, "wide.masked.gather");
2471      addMetadata(NewLI, LI);
2472    } else {
2473      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2474      if (isMaskRequired)
2475        NewLI = Builder.CreateMaskedLoad(
2476            VecPtr, Alignment.value(), BlockInMaskParts[Part],
2477            UndefValue::get(DataTy), "wide.masked.load");
2478      else
2479        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2480                                          "wide.load");
2481
2482      // Add metadata to the load, but setVectorValue to the reverse shuffle.
2483      addMetadata(NewLI, LI);
2484      if (Reverse)
2485        NewLI = reverseVector(NewLI);
2486    }
2487    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2488  }
2489}
2490
2491void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2492                                               const VPIteration &Instance,
2493                                               bool IfPredicateInstr) {
2494  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2495
2496  setDebugLocFromInst(Builder, Instr);
2497
2498  // Does this instruction return a value ?
2499  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2500
2501  Instruction *Cloned = Instr->clone();
2502  if (!IsVoidRetTy)
2503    Cloned->setName(Instr->getName() + ".cloned");
2504
2505  // Replace the operands of the cloned instructions with their scalar
2506  // equivalents in the new loop.
2507  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2508    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2509    Cloned->setOperand(op, NewOp);
2510  }
2511  addNewMetadata(Cloned, Instr);
2512
2513  // Place the cloned scalar in the new loop.
2514  Builder.Insert(Cloned);
2515
2516  // Add the cloned scalar to the scalar map entry.
2517  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2518
2519  // If we just cloned a new assumption, add it the assumption cache.
2520  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2521    if (II->getIntrinsicID() == Intrinsic::assume)
2522      AC->registerAssumption(II);
2523
2524  // End if-block.
2525  if (IfPredicateInstr)
2526    PredicatedInstructions.push_back(Cloned);
2527}
2528
2529PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2530                                                      Value *End, Value *Step,
2531                                                      Instruction *DL) {
2532  BasicBlock *Header = L->getHeader();
2533  BasicBlock *Latch = L->getLoopLatch();
2534  // As we're just creating this loop, it's possible no latch exists
2535  // yet. If so, use the header as this will be a single block loop.
2536  if (!Latch)
2537    Latch = Header;
2538
2539  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2540  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2541  setDebugLocFromInst(Builder, OldInst);
2542  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2543
2544  Builder.SetInsertPoint(Latch->getTerminator());
2545  setDebugLocFromInst(Builder, OldInst);
2546
2547  // Create i+1 and fill the PHINode.
2548  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2549  Induction->addIncoming(Start, L->getLoopPreheader());
2550  Induction->addIncoming(Next, Latch);
2551  // Create the compare.
2552  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2553  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2554
2555  // Now we have two terminators. Remove the old one from the block.
2556  Latch->getTerminator()->eraseFromParent();
2557
2558  return Induction;
2559}
2560
2561Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2562  if (TripCount)
2563    return TripCount;
2564
2565  assert(L && "Create Trip Count for null loop.");
2566  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2567  // Find the loop boundaries.
2568  ScalarEvolution *SE = PSE.getSE();
2569  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2570  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2571         "Invalid loop count");
2572
2573  Type *IdxTy = Legal->getWidestInductionType();
2574  assert(IdxTy && "No type for induction");
2575
2576  // The exit count might have the type of i64 while the phi is i32. This can
2577  // happen if we have an induction variable that is sign extended before the
2578  // compare. The only way that we get a backedge taken count is that the
2579  // induction variable was signed and as such will not overflow. In such a case
2580  // truncation is legal.
2581  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2582      IdxTy->getPrimitiveSizeInBits())
2583    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2584  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2585
2586  // Get the total trip count from the count by adding 1.
2587  const SCEV *ExitCount = SE->getAddExpr(
2588      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2589
2590  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2591
2592  // Expand the trip count and place the new instructions in the preheader.
2593  // Notice that the pre-header does not change, only the loop body.
2594  SCEVExpander Exp(*SE, DL, "induction");
2595
2596  // Count holds the overall loop count (N).
2597  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2598                                L->getLoopPreheader()->getTerminator());
2599
2600  if (TripCount->getType()->isPointerTy())
2601    TripCount =
2602        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2603                                    L->getLoopPreheader()->getTerminator());
2604
2605  return TripCount;
2606}
2607
2608Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2609  if (VectorTripCount)
2610    return VectorTripCount;
2611
2612  Value *TC = getOrCreateTripCount(L);
2613  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2614
2615  Type *Ty = TC->getType();
2616  Constant *Step = ConstantInt::get(Ty, VF * UF);
2617
2618  // If the tail is to be folded by masking, round the number of iterations N
2619  // up to a multiple of Step instead of rounding down. This is done by first
2620  // adding Step-1 and then rounding down. Note that it's ok if this addition
2621  // overflows: the vector induction variable will eventually wrap to zero given
2622  // that it starts at zero and its Step is a power of two; the loop will then
2623  // exit, with the last early-exit vector comparison also producing all-true.
2624  if (Cost->foldTailByMasking()) {
2625    assert(isPowerOf2_32(VF * UF) &&
2626           "VF*UF must be a power of 2 when folding tail by masking");
2627    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2628  }
2629
2630  // Now we need to generate the expression for the part of the loop that the
2631  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2632  // iterations are not required for correctness, or N - Step, otherwise. Step
2633  // is equal to the vectorization factor (number of SIMD elements) times the
2634  // unroll factor (number of SIMD instructions).
2635  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2636
2637  // If there is a non-reversed interleaved group that may speculatively access
2638  // memory out-of-bounds, we need to ensure that there will be at least one
2639  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2640  // the trip count, we set the remainder to be equal to the step. If the step
2641  // does not evenly divide the trip count, no adjustment is necessary since
2642  // there will already be scalar iterations. Note that the minimum iterations
2643  // check ensures that N >= Step.
2644  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2645    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2646    R = Builder.CreateSelect(IsZero, Step, R);
2647  }
2648
2649  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2650
2651  return VectorTripCount;
2652}
2653
2654Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2655                                                   const DataLayout &DL) {
2656  // Verify that V is a vector type with same number of elements as DstVTy.
2657  unsigned VF = DstVTy->getNumElements();
2658  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2659  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2660  Type *SrcElemTy = SrcVecTy->getElementType();
2661  Type *DstElemTy = DstVTy->getElementType();
2662  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2663         "Vector elements must have same size");
2664
2665  // Do a direct cast if element types are castable.
2666  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2667    return Builder.CreateBitOrPointerCast(V, DstVTy);
2668  }
2669  // V cannot be directly casted to desired vector type.
2670  // May happen when V is a floating point vector but DstVTy is a vector of
2671  // pointers or vice-versa. Handle this using a two-step bitcast using an
2672  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2673  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2674         "Only one type should be a pointer type");
2675  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2676         "Only one type should be a floating point type");
2677  Type *IntTy =
2678      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2679  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2680  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2681  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2682}
2683
2684void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2685                                                         BasicBlock *Bypass) {
2686  Value *Count = getOrCreateTripCount(L);
2687  // Reuse existing vector loop preheader for TC checks.
2688  // Note that new preheader block is generated for vector loop.
2689  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2690  IRBuilder<> Builder(TCCheckBlock->getTerminator());
2691
2692  // Generate code to check if the loop's trip count is less than VF * UF, or
2693  // equal to it in case a scalar epilogue is required; this implies that the
2694  // vector trip count is zero. This check also covers the case where adding one
2695  // to the backedge-taken count overflowed leading to an incorrect trip count
2696  // of zero. In this case we will also jump to the scalar loop.
2697  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2698                                          : ICmpInst::ICMP_ULT;
2699
2700  // If tail is to be folded, vector loop takes care of all iterations.
2701  Value *CheckMinIters = Builder.getFalse();
2702  if (!Cost->foldTailByMasking())
2703    CheckMinIters = Builder.CreateICmp(
2704        P, Count, ConstantInt::get(Count->getType(), VF * UF),
2705        "min.iters.check");
2706
2707  // Create new preheader for vector loop.
2708  LoopVectorPreHeader =
2709      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2710                 "vector.ph");
2711
2712  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2713                               DT->getNode(Bypass)->getIDom()) &&
2714         "TC check is expected to dominate Bypass");
2715
2716  // Update dominator for Bypass & LoopExit.
2717  DT->changeImmediateDominator(Bypass, TCCheckBlock);
2718  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2719
2720  ReplaceInstWithInst(
2721      TCCheckBlock->getTerminator(),
2722      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2723  LoopBypassBlocks.push_back(TCCheckBlock);
2724}
2725
2726void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2727  // Reuse existing vector loop preheader for SCEV checks.
2728  // Note that new preheader block is generated for vector loop.
2729  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2730
2731  // Generate the code to check that the SCEV assumptions that we made.
2732  // We want the new basic block to start at the first instruction in a
2733  // sequence of instructions that form a check.
2734  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2735                   "scev.check");
2736  Value *SCEVCheck = Exp.expandCodeForPredicate(
2737      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2738
2739  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2740    if (C->isZero())
2741      return;
2742
2743  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2744         "Cannot SCEV check stride or overflow when optimizing for size");
2745
2746  SCEVCheckBlock->setName("vector.scevcheck");
2747  // Create new preheader for vector loop.
2748  LoopVectorPreHeader =
2749      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2750                 nullptr, "vector.ph");
2751
2752  // Update dominator only if this is first RT check.
2753  if (LoopBypassBlocks.empty()) {
2754    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2755    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2756  }
2757
2758  ReplaceInstWithInst(
2759      SCEVCheckBlock->getTerminator(),
2760      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2761  LoopBypassBlocks.push_back(SCEVCheckBlock);
2762  AddedSafetyChecks = true;
2763}
2764
2765void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2766  // VPlan-native path does not do any analysis for runtime checks currently.
2767  if (EnableVPlanNativePath)
2768    return;
2769
2770  // Reuse existing vector loop preheader for runtime memory checks.
2771  // Note that new preheader block is generated for vector loop.
2772  BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2773
2774  // Generate the code that checks in runtime if arrays overlap. We put the
2775  // checks into a separate block to make the more common case of few elements
2776  // faster.
2777  Instruction *FirstCheckInst;
2778  Instruction *MemRuntimeCheck;
2779  std::tie(FirstCheckInst, MemRuntimeCheck) =
2780      Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2781  if (!MemRuntimeCheck)
2782    return;
2783
2784  if (MemCheckBlock->getParent()->hasOptSize()) {
2785    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2786           "Cannot emit memory checks when optimizing for size, unless forced "
2787           "to vectorize.");
2788    ORE->emit([&]() {
2789      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2790                                        L->getStartLoc(), L->getHeader())
2791             << "Code-size may be reduced by not forcing "
2792                "vectorization, or by source-code modifications "
2793                "eliminating the need for runtime checks "
2794                "(e.g., adding 'restrict').";
2795    });
2796  }
2797
2798  MemCheckBlock->setName("vector.memcheck");
2799  // Create new preheader for vector loop.
2800  LoopVectorPreHeader =
2801      SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2802                 "vector.ph");
2803
2804  // Update dominator only if this is first RT check.
2805  if (LoopBypassBlocks.empty()) {
2806    DT->changeImmediateDominator(Bypass, MemCheckBlock);
2807    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2808  }
2809
2810  ReplaceInstWithInst(
2811      MemCheckBlock->getTerminator(),
2812      BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2813  LoopBypassBlocks.push_back(MemCheckBlock);
2814  AddedSafetyChecks = true;
2815
2816  // We currently don't use LoopVersioning for the actual loop cloning but we
2817  // still use it to add the noalias metadata.
2818  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2819                                          PSE.getSE());
2820  LVer->prepareNoAliasMetadata();
2821}
2822
2823Value *InnerLoopVectorizer::emitTransformedIndex(
2824    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2825    const InductionDescriptor &ID) const {
2826
2827  SCEVExpander Exp(*SE, DL, "induction");
2828  auto Step = ID.getStep();
2829  auto StartValue = ID.getStartValue();
2830  assert(Index->getType() == Step->getType() &&
2831         "Index type does not match StepValue type");
2832
2833  // Note: the IR at this point is broken. We cannot use SE to create any new
2834  // SCEV and then expand it, hoping that SCEV's simplification will give us
2835  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2836  // lead to various SCEV crashes. So all we can do is to use builder and rely
2837  // on InstCombine for future simplifications. Here we handle some trivial
2838  // cases only.
2839  auto CreateAdd = [&B](Value *X, Value *Y) {
2840    assert(X->getType() == Y->getType() && "Types don't match!");
2841    if (auto *CX = dyn_cast<ConstantInt>(X))
2842      if (CX->isZero())
2843        return Y;
2844    if (auto *CY = dyn_cast<ConstantInt>(Y))
2845      if (CY->isZero())
2846        return X;
2847    return B.CreateAdd(X, Y);
2848  };
2849
2850  auto CreateMul = [&B](Value *X, Value *Y) {
2851    assert(X->getType() == Y->getType() && "Types don't match!");
2852    if (auto *CX = dyn_cast<ConstantInt>(X))
2853      if (CX->isOne())
2854        return Y;
2855    if (auto *CY = dyn_cast<ConstantInt>(Y))
2856      if (CY->isOne())
2857        return X;
2858    return B.CreateMul(X, Y);
2859  };
2860
2861  switch (ID.getKind()) {
2862  case InductionDescriptor::IK_IntInduction: {
2863    assert(Index->getType() == StartValue->getType() &&
2864           "Index type does not match StartValue type");
2865    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2866      return B.CreateSub(StartValue, Index);
2867    auto *Offset = CreateMul(
2868        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2869    return CreateAdd(StartValue, Offset);
2870  }
2871  case InductionDescriptor::IK_PtrInduction: {
2872    assert(isa<SCEVConstant>(Step) &&
2873           "Expected constant step for pointer induction");
2874    return B.CreateGEP(
2875        StartValue->getType()->getPointerElementType(), StartValue,
2876        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2877                                           &*B.GetInsertPoint())));
2878  }
2879  case InductionDescriptor::IK_FpInduction: {
2880    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2881    auto InductionBinOp = ID.getInductionBinOp();
2882    assert(InductionBinOp &&
2883           (InductionBinOp->getOpcode() == Instruction::FAdd ||
2884            InductionBinOp->getOpcode() == Instruction::FSub) &&
2885           "Original bin op should be defined for FP induction");
2886
2887    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2888
2889    // Floating point operations had to be 'fast' to enable the induction.
2890    FastMathFlags Flags;
2891    Flags.setFast();
2892
2893    Value *MulExp = B.CreateFMul(StepValue, Index);
2894    if (isa<Instruction>(MulExp))
2895      // We have to check, the MulExp may be a constant.
2896      cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2897
2898    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2899                               "induction");
2900    if (isa<Instruction>(BOp))
2901      cast<Instruction>(BOp)->setFastMathFlags(Flags);
2902
2903    return BOp;
2904  }
2905  case InductionDescriptor::IK_NoInduction:
2906    return nullptr;
2907  }
2908  llvm_unreachable("invalid enum");
2909}
2910
2911BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2912  /*
2913   In this function we generate a new loop. The new loop will contain
2914   the vectorized instructions while the old loop will continue to run the
2915   scalar remainder.
2916
2917       [ ] <-- loop iteration number check.
2918    /   |
2919   /    v
2920  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2921  |  /  |
2922  | /   v
2923  ||   [ ]     <-- vector pre header.
2924  |/    |
2925  |     v
2926  |    [  ] \
2927  |    [  ]_|   <-- vector loop.
2928  |     |
2929  |     v
2930  |   -[ ]   <--- middle-block.
2931  |  /  |
2932  | /   v
2933  -|- >[ ]     <--- new preheader.
2934   |    |
2935   |    v
2936   |   [ ] \
2937   |   [ ]_|   <-- old scalar loop to handle remainder.
2938    \   |
2939     \  v
2940      >[ ]     <-- exit block.
2941   ...
2942   */
2943
2944  MDNode *OrigLoopID = OrigLoop->getLoopID();
2945
2946  // Some loops have a single integer induction variable, while other loops
2947  // don't. One example is c++ iterators that often have multiple pointer
2948  // induction variables. In the code below we also support a case where we
2949  // don't have a single induction variable.
2950  //
2951  // We try to obtain an induction variable from the original loop as hard
2952  // as possible. However if we don't find one that:
2953  //   - is an integer
2954  //   - counts from zero, stepping by one
2955  //   - is the size of the widest induction variable type
2956  // then we create a new one.
2957  OldInduction = Legal->getPrimaryInduction();
2958  Type *IdxTy = Legal->getWidestInductionType();
2959
2960  // Split the single block loop into the two loop structure described above.
2961  LoopScalarBody = OrigLoop->getHeader();
2962  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2963  LoopExitBlock = OrigLoop->getExitBlock();
2964  assert(LoopExitBlock && "Must have an exit block");
2965  assert(LoopVectorPreHeader && "Invalid loop structure");
2966
2967  LoopMiddleBlock =
2968      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2969                 LI, nullptr, "middle.block");
2970  LoopScalarPreHeader =
2971      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2972                 nullptr, "scalar.ph");
2973  // We intentionally don't let SplitBlock to update LoopInfo since
2974  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2975  // LoopVectorBody is explicitly added to the correct place few lines later.
2976  LoopVectorBody =
2977      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2978                 nullptr, nullptr, "vector.body");
2979
2980  // Update dominator for loop exit.
2981  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2982
2983  // Create and register the new vector loop.
2984  Loop *Lp = LI->AllocateLoop();
2985  Loop *ParentLoop = OrigLoop->getParentLoop();
2986
2987  // Insert the new loop into the loop nest and register the new basic blocks
2988  // before calling any utilities such as SCEV that require valid LoopInfo.
2989  if (ParentLoop) {
2990    ParentLoop->addChildLoop(Lp);
2991  } else {
2992    LI->addTopLevelLoop(Lp);
2993  }
2994  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2995
2996  // Find the loop boundaries.
2997  Value *Count = getOrCreateTripCount(Lp);
2998
2999  Value *StartIdx = ConstantInt::get(IdxTy, 0);
3000
3001  // Now, compare the new count to zero. If it is zero skip the vector loop and
3002  // jump to the scalar loop. This check also covers the case where the
3003  // backedge-taken count is uint##_max: adding one to it will overflow leading
3004  // to an incorrect trip count of zero. In this (rare) case we will also jump
3005  // to the scalar loop.
3006  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3007
3008  // Generate the code to check any assumptions that we've made for SCEV
3009  // expressions.
3010  emitSCEVChecks(Lp, LoopScalarPreHeader);
3011
3012  // Generate the code that checks in runtime if arrays overlap. We put the
3013  // checks into a separate block to make the more common case of few elements
3014  // faster.
3015  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3016
3017  // Generate the induction variable.
3018  // The loop step is equal to the vectorization factor (num of SIMD elements)
3019  // times the unroll factor (num of SIMD instructions).
3020  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3021  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3022  Induction =
3023      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3024                              getDebugLocFromInstOrOperands(OldInduction));
3025
3026  // We are going to resume the execution of the scalar loop.
3027  // Go over all of the induction variables that we found and fix the
3028  // PHIs that are left in the scalar version of the loop.
3029  // The starting values of PHI nodes depend on the counter of the last
3030  // iteration in the vectorized loop.
3031  // If we come from a bypass edge then we need to start from the original
3032  // start value.
3033
3034  // This variable saves the new starting index for the scalar loop. It is used
3035  // to test if there are any tail iterations left once the vector loop has
3036  // completed.
3037  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3038  for (auto &InductionEntry : *List) {
3039    PHINode *OrigPhi = InductionEntry.first;
3040    InductionDescriptor II = InductionEntry.second;
3041
3042    // Create phi nodes to merge from the  backedge-taken check block.
3043    PHINode *BCResumeVal =
3044        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3045                        LoopScalarPreHeader->getTerminator());
3046    // Copy original phi DL over to the new one.
3047    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3048    Value *&EndValue = IVEndValues[OrigPhi];
3049    if (OrigPhi == OldInduction) {
3050      // We know what the end value is.
3051      EndValue = CountRoundDown;
3052    } else {
3053      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3054      Type *StepType = II.getStep()->getType();
3055      Instruction::CastOps CastOp =
3056          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3057      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3058      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3059      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3060      EndValue->setName("ind.end");
3061    }
3062
3063    // The new PHI merges the original incoming value, in case of a bypass,
3064    // or the value at the end of the vectorized loop.
3065    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3066
3067    // Fix the scalar body counter (PHI node).
3068    // The old induction's phi node in the scalar body needs the truncated
3069    // value.
3070    for (BasicBlock *BB : LoopBypassBlocks)
3071      BCResumeVal->addIncoming(II.getStartValue(), BB);
3072    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3073  }
3074
3075  // We need the OrigLoop (scalar loop part) latch terminator to help
3076  // produce correct debug info for the middle block BB instructions.
3077  // The legality check stage guarantees that the loop will have a single
3078  // latch.
3079  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3080         "Scalar loop latch terminator isn't a branch");
3081  BranchInst *ScalarLatchBr =
3082      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3083
3084  // Add a check in the middle block to see if we have completed
3085  // all of the iterations in the first vector loop.
3086  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3087  // If tail is to be folded, we know we don't need to run the remainder.
3088  Value *CmpN = Builder.getTrue();
3089  if (!Cost->foldTailByMasking()) {
3090    CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3091                           CountRoundDown, "cmp.n",
3092                           LoopMiddleBlock->getTerminator());
3093
3094    // Here we use the same DebugLoc as the scalar loop latch branch instead
3095    // of the corresponding compare because they may have ended up with
3096    // different line numbers and we want to avoid awkward line stepping while
3097    // debugging. Eg. if the compare has got a line number inside the loop.
3098    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3099  }
3100
3101  BranchInst *BrInst =
3102      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3103  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3104  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3105
3106  // Get ready to start creating new instructions into the vectorized body.
3107  assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3108         "Inconsistent vector loop preheader");
3109  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3110
3111  Optional<MDNode *> VectorizedLoopID =
3112      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3113                                      LLVMLoopVectorizeFollowupVectorized});
3114  if (VectorizedLoopID.hasValue()) {
3115    Lp->setLoopID(VectorizedLoopID.getValue());
3116
3117    // Do not setAlreadyVectorized if loop attributes have been defined
3118    // explicitly.
3119    return LoopVectorPreHeader;
3120  }
3121
3122  // Keep all loop hints from the original loop on the vector loop (we'll
3123  // replace the vectorizer-specific hints below).
3124  if (MDNode *LID = OrigLoop->getLoopID())
3125    Lp->setLoopID(LID);
3126
3127  LoopVectorizeHints Hints(Lp, true, *ORE);
3128  Hints.setAlreadyVectorized();
3129
3130#ifdef EXPENSIVE_CHECKS
3131  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3132  LI->verify(*DT);
3133#endif
3134
3135  return LoopVectorPreHeader;
3136}
3137
3138// Fix up external users of the induction variable. At this point, we are
3139// in LCSSA form, with all external PHIs that use the IV having one input value,
3140// coming from the remainder loop. We need those PHIs to also have a correct
3141// value for the IV when arriving directly from the middle block.
3142void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3143                                       const InductionDescriptor &II,
3144                                       Value *CountRoundDown, Value *EndValue,
3145                                       BasicBlock *MiddleBlock) {
3146  // There are two kinds of external IV usages - those that use the value
3147  // computed in the last iteration (the PHI) and those that use the penultimate
3148  // value (the value that feeds into the phi from the loop latch).
3149  // We allow both, but they, obviously, have different values.
3150
3151  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3152
3153  DenseMap<Value *, Value *> MissingVals;
3154
3155  // An external user of the last iteration's value should see the value that
3156  // the remainder loop uses to initialize its own IV.
3157  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3158  for (User *U : PostInc->users()) {
3159    Instruction *UI = cast<Instruction>(U);
3160    if (!OrigLoop->contains(UI)) {
3161      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3162      MissingVals[UI] = EndValue;
3163    }
3164  }
3165
3166  // An external user of the penultimate value need to see EndValue - Step.
3167  // The simplest way to get this is to recompute it from the constituent SCEVs,
3168  // that is Start + (Step * (CRD - 1)).
3169  for (User *U : OrigPhi->users()) {
3170    auto *UI = cast<Instruction>(U);
3171    if (!OrigLoop->contains(UI)) {
3172      const DataLayout &DL =
3173          OrigLoop->getHeader()->getModule()->getDataLayout();
3174      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3175
3176      IRBuilder<> B(MiddleBlock->getTerminator());
3177      Value *CountMinusOne = B.CreateSub(
3178          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3179      Value *CMO =
3180          !II.getStep()->getType()->isIntegerTy()
3181              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3182                             II.getStep()->getType())
3183              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3184      CMO->setName("cast.cmo");
3185      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3186      Escape->setName("ind.escape");
3187      MissingVals[UI] = Escape;
3188    }
3189  }
3190
3191  for (auto &I : MissingVals) {
3192    PHINode *PHI = cast<PHINode>(I.first);
3193    // One corner case we have to handle is two IVs "chasing" each-other,
3194    // that is %IV2 = phi [...], [ %IV1, %latch ]
3195    // In this case, if IV1 has an external use, we need to avoid adding both
3196    // "last value of IV1" and "penultimate value of IV2". So, verify that we
3197    // don't already have an incoming value for the middle block.
3198    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3199      PHI->addIncoming(I.second, MiddleBlock);
3200  }
3201}
3202
3203namespace {
3204
3205struct CSEDenseMapInfo {
3206  static bool canHandle(const Instruction *I) {
3207    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3208           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3209  }
3210
3211  static inline Instruction *getEmptyKey() {
3212    return DenseMapInfo<Instruction *>::getEmptyKey();
3213  }
3214
3215  static inline Instruction *getTombstoneKey() {
3216    return DenseMapInfo<Instruction *>::getTombstoneKey();
3217  }
3218
3219  static unsigned getHashValue(const Instruction *I) {
3220    assert(canHandle(I) && "Unknown instruction!");
3221    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3222                                                           I->value_op_end()));
3223  }
3224
3225  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3226    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3227        LHS == getTombstoneKey() || RHS == getTombstoneKey())
3228      return LHS == RHS;
3229    return LHS->isIdenticalTo(RHS);
3230  }
3231};
3232
3233} // end anonymous namespace
3234
3235///Perform cse of induction variable instructions.
3236static void cse(BasicBlock *BB) {
3237  // Perform simple cse.
3238  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3239  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3240    Instruction *In = &*I++;
3241
3242    if (!CSEDenseMapInfo::canHandle(In))
3243      continue;
3244
3245    // Check if we can replace this instruction with any of the
3246    // visited instructions.
3247    if (Instruction *V = CSEMap.lookup(In)) {
3248      In->replaceAllUsesWith(V);
3249      In->eraseFromParent();
3250      continue;
3251    }
3252
3253    CSEMap[In] = In;
3254  }
3255}
3256
3257unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3258                                                       unsigned VF,
3259                                                       bool &NeedToScalarize) {
3260  Function *F = CI->getCalledFunction();
3261  StringRef FnName = CI->getCalledFunction()->getName();
3262  Type *ScalarRetTy = CI->getType();
3263  SmallVector<Type *, 4> Tys, ScalarTys;
3264  for (auto &ArgOp : CI->arg_operands())
3265    ScalarTys.push_back(ArgOp->getType());
3266
3267  // Estimate cost of scalarized vector call. The source operands are assumed
3268  // to be vectors, so we need to extract individual elements from there,
3269  // execute VF scalar calls, and then gather the result into the vector return
3270  // value.
3271  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3272  if (VF == 1)
3273    return ScalarCallCost;
3274
3275  // Compute corresponding vector type for return value and arguments.
3276  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3277  for (Type *ScalarTy : ScalarTys)
3278    Tys.push_back(ToVectorTy(ScalarTy, VF));
3279
3280  // Compute costs of unpacking argument values for the scalar calls and
3281  // packing the return values to a vector.
3282  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3283
3284  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3285
3286  // If we can't emit a vector call for this function, then the currently found
3287  // cost is the cost we need to return.
3288  NeedToScalarize = true;
3289  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3290    return Cost;
3291
3292  // If the corresponding vector cost is cheaper, return its cost.
3293  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3294  if (VectorCallCost < Cost) {
3295    NeedToScalarize = false;
3296    return VectorCallCost;
3297  }
3298  return Cost;
3299}
3300
3301unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3302                                                            unsigned VF) {
3303  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3304  assert(ID && "Expected intrinsic call!");
3305
3306  FastMathFlags FMF;
3307  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3308    FMF = FPMO->getFastMathFlags();
3309
3310  SmallVector<Value *, 4> Operands(CI->arg_operands());
3311  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3312}
3313
3314static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3315  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3316  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3317  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3318}
3319static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3320  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3321  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3322  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3323}
3324
3325void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3326  // For every instruction `I` in MinBWs, truncate the operands, create a
3327  // truncated version of `I` and reextend its result. InstCombine runs
3328  // later and will remove any ext/trunc pairs.
3329  SmallPtrSet<Value *, 4> Erased;
3330  for (const auto &KV : Cost->getMinimalBitwidths()) {
3331    // If the value wasn't vectorized, we must maintain the original scalar
3332    // type. The absence of the value from VectorLoopValueMap indicates that it
3333    // wasn't vectorized.
3334    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3335      continue;
3336    for (unsigned Part = 0; Part < UF; ++Part) {
3337      Value *I = getOrCreateVectorValue(KV.first, Part);
3338      if (Erased.find(I) != Erased.end() || I->use_empty() ||
3339          !isa<Instruction>(I))
3340        continue;
3341      Type *OriginalTy = I->getType();
3342      Type *ScalarTruncatedTy =
3343          IntegerType::get(OriginalTy->getContext(), KV.second);
3344      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3345                                          OriginalTy->getVectorNumElements());
3346      if (TruncatedTy == OriginalTy)
3347        continue;
3348
3349      IRBuilder<> B(cast<Instruction>(I));
3350      auto ShrinkOperand = [&](Value *V) -> Value * {
3351        if (auto *ZI = dyn_cast<ZExtInst>(V))
3352          if (ZI->getSrcTy() == TruncatedTy)
3353            return ZI->getOperand(0);
3354        return B.CreateZExtOrTrunc(V, TruncatedTy);
3355      };
3356
3357      // The actual instruction modification depends on the instruction type,
3358      // unfortunately.
3359      Value *NewI = nullptr;
3360      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3361        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3362                             ShrinkOperand(BO->getOperand(1)));
3363
3364        // Any wrapping introduced by shrinking this operation shouldn't be
3365        // considered undefined behavior. So, we can't unconditionally copy
3366        // arithmetic wrapping flags to NewI.
3367        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3368      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3369        NewI =
3370            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3371                         ShrinkOperand(CI->getOperand(1)));
3372      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3373        NewI = B.CreateSelect(SI->getCondition(),
3374                              ShrinkOperand(SI->getTrueValue()),
3375                              ShrinkOperand(SI->getFalseValue()));
3376      } else if (auto *CI = dyn_cast<CastInst>(I)) {
3377        switch (CI->getOpcode()) {
3378        default:
3379          llvm_unreachable("Unhandled cast!");
3380        case Instruction::Trunc:
3381          NewI = ShrinkOperand(CI->getOperand(0));
3382          break;
3383        case Instruction::SExt:
3384          NewI = B.CreateSExtOrTrunc(
3385              CI->getOperand(0),
3386              smallestIntegerVectorType(OriginalTy, TruncatedTy));
3387          break;
3388        case Instruction::ZExt:
3389          NewI = B.CreateZExtOrTrunc(
3390              CI->getOperand(0),
3391              smallestIntegerVectorType(OriginalTy, TruncatedTy));
3392          break;
3393        }
3394      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3395        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3396        auto *O0 = B.CreateZExtOrTrunc(
3397            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3398        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3399        auto *O1 = B.CreateZExtOrTrunc(
3400            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3401
3402        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3403      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3404        // Don't do anything with the operands, just extend the result.
3405        continue;
3406      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3407        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3408        auto *O0 = B.CreateZExtOrTrunc(
3409            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3410        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3411        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3412      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3413        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3414        auto *O0 = B.CreateZExtOrTrunc(
3415            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3416        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3417      } else {
3418        // If we don't know what to do, be conservative and don't do anything.
3419        continue;
3420      }
3421
3422      // Lastly, extend the result.
3423      NewI->takeName(cast<Instruction>(I));
3424      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3425      I->replaceAllUsesWith(Res);
3426      cast<Instruction>(I)->eraseFromParent();
3427      Erased.insert(I);
3428      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3429    }
3430  }
3431
3432  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3433  for (const auto &KV : Cost->getMinimalBitwidths()) {
3434    // If the value wasn't vectorized, we must maintain the original scalar
3435    // type. The absence of the value from VectorLoopValueMap indicates that it
3436    // wasn't vectorized.
3437    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3438      continue;
3439    for (unsigned Part = 0; Part < UF; ++Part) {
3440      Value *I = getOrCreateVectorValue(KV.first, Part);
3441      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3442      if (Inst && Inst->use_empty()) {
3443        Value *NewI = Inst->getOperand(0);
3444        Inst->eraseFromParent();
3445        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3446      }
3447    }
3448  }
3449}
3450
3451void InnerLoopVectorizer::fixVectorizedLoop() {
3452  // Insert truncates and extends for any truncated instructions as hints to
3453  // InstCombine.
3454  if (VF > 1)
3455    truncateToMinimalBitwidths();
3456
3457  // Fix widened non-induction PHIs by setting up the PHI operands.
3458  if (OrigPHIsToFix.size()) {
3459    assert(EnableVPlanNativePath &&
3460           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3461    fixNonInductionPHIs();
3462  }
3463
3464  // At this point every instruction in the original loop is widened to a
3465  // vector form. Now we need to fix the recurrences in the loop. These PHI
3466  // nodes are currently empty because we did not want to introduce cycles.
3467  // This is the second stage of vectorizing recurrences.
3468  fixCrossIterationPHIs();
3469
3470  // Forget the original basic block.
3471  PSE.getSE()->forgetLoop(OrigLoop);
3472
3473  // Fix-up external users of the induction variables.
3474  for (auto &Entry : *Legal->getInductionVars())
3475    fixupIVUsers(Entry.first, Entry.second,
3476                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3477                 IVEndValues[Entry.first], LoopMiddleBlock);
3478
3479  fixLCSSAPHIs();
3480  for (Instruction *PI : PredicatedInstructions)
3481    sinkScalarOperands(&*PI);
3482
3483  // Remove redundant induction instructions.
3484  cse(LoopVectorBody);
3485}
3486
3487void InnerLoopVectorizer::fixCrossIterationPHIs() {
3488  // In order to support recurrences we need to be able to vectorize Phi nodes.
3489  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3490  // stage #2: We now need to fix the recurrences by adding incoming edges to
3491  // the currently empty PHI nodes. At this point every instruction in the
3492  // original loop is widened to a vector form so we can use them to construct
3493  // the incoming edges.
3494  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3495    // Handle first-order recurrences and reductions that need to be fixed.
3496    if (Legal->isFirstOrderRecurrence(&Phi))
3497      fixFirstOrderRecurrence(&Phi);
3498    else if (Legal->isReductionVariable(&Phi))
3499      fixReduction(&Phi);
3500  }
3501}
3502
3503void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3504  // This is the second phase of vectorizing first-order recurrences. An
3505  // overview of the transformation is described below. Suppose we have the
3506  // following loop.
3507  //
3508  //   for (int i = 0; i < n; ++i)
3509  //     b[i] = a[i] - a[i - 1];
3510  //
3511  // There is a first-order recurrence on "a". For this loop, the shorthand
3512  // scalar IR looks like:
3513  //
3514  //   scalar.ph:
3515  //     s_init = a[-1]
3516  //     br scalar.body
3517  //
3518  //   scalar.body:
3519  //     i = phi [0, scalar.ph], [i+1, scalar.body]
3520  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3521  //     s2 = a[i]
3522  //     b[i] = s2 - s1
3523  //     br cond, scalar.body, ...
3524  //
3525  // In this example, s1 is a recurrence because it's value depends on the
3526  // previous iteration. In the first phase of vectorization, we created a
3527  // temporary value for s1. We now complete the vectorization and produce the
3528  // shorthand vector IR shown below (for VF = 4, UF = 1).
3529  //
3530  //   vector.ph:
3531  //     v_init = vector(..., ..., ..., a[-1])
3532  //     br vector.body
3533  //
3534  //   vector.body
3535  //     i = phi [0, vector.ph], [i+4, vector.body]
3536  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3537  //     v2 = a[i, i+1, i+2, i+3];
3538  //     v3 = vector(v1(3), v2(0, 1, 2))
3539  //     b[i, i+1, i+2, i+3] = v2 - v3
3540  //     br cond, vector.body, middle.block
3541  //
3542  //   middle.block:
3543  //     x = v2(3)
3544  //     br scalar.ph
3545  //
3546  //   scalar.ph:
3547  //     s_init = phi [x, middle.block], [a[-1], otherwise]
3548  //     br scalar.body
3549  //
3550  // After execution completes the vector loop, we extract the next value of
3551  // the recurrence (x) to use as the initial value in the scalar loop.
3552
3553  // Get the original loop preheader and single loop latch.
3554  auto *Preheader = OrigLoop->getLoopPreheader();
3555  auto *Latch = OrigLoop->getLoopLatch();
3556
3557  // Get the initial and previous values of the scalar recurrence.
3558  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3559  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3560
3561  // Create a vector from the initial value.
3562  auto *VectorInit = ScalarInit;
3563  if (VF > 1) {
3564    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3565    VectorInit = Builder.CreateInsertElement(
3566        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3567        Builder.getInt32(VF - 1), "vector.recur.init");
3568  }
3569
3570  // We constructed a temporary phi node in the first phase of vectorization.
3571  // This phi node will eventually be deleted.
3572  Builder.SetInsertPoint(
3573      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3574
3575  // Create a phi node for the new recurrence. The current value will either be
3576  // the initial value inserted into a vector or loop-varying vector value.
3577  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3578  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3579
3580  // Get the vectorized previous value of the last part UF - 1. It appears last
3581  // among all unrolled iterations, due to the order of their construction.
3582  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3583
3584  // Find and set the insertion point after the previous value if it is an
3585  // instruction.
3586  BasicBlock::iterator InsertPt;
3587  // Note that the previous value may have been constant-folded so it is not
3588  // guaranteed to be an instruction in the vector loop.
3589  // FIXME: Loop invariant values do not form recurrences. We should deal with
3590  //        them earlier.
3591  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3592    InsertPt = LoopVectorBody->getFirstInsertionPt();
3593  else {
3594    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3595    if (isa<PHINode>(PreviousLastPart))
3596      // If the previous value is a phi node, we should insert after all the phi
3597      // nodes in the block containing the PHI to avoid breaking basic block
3598      // verification. Note that the basic block may be different to
3599      // LoopVectorBody, in case we predicate the loop.
3600      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3601    else
3602      InsertPt = ++PreviousInst->getIterator();
3603  }
3604  Builder.SetInsertPoint(&*InsertPt);
3605
3606  // We will construct a vector for the recurrence by combining the values for
3607  // the current and previous iterations. This is the required shuffle mask.
3608  SmallVector<Constant *, 8> ShuffleMask(VF);
3609  ShuffleMask[0] = Builder.getInt32(VF - 1);
3610  for (unsigned I = 1; I < VF; ++I)
3611    ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3612
3613  // The vector from which to take the initial value for the current iteration
3614  // (actual or unrolled). Initially, this is the vector phi node.
3615  Value *Incoming = VecPhi;
3616
3617  // Shuffle the current and previous vector and update the vector parts.
3618  for (unsigned Part = 0; Part < UF; ++Part) {
3619    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3620    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3621    auto *Shuffle =
3622        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3623                                             ConstantVector::get(ShuffleMask))
3624               : Incoming;
3625    PhiPart->replaceAllUsesWith(Shuffle);
3626    cast<Instruction>(PhiPart)->eraseFromParent();
3627    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3628    Incoming = PreviousPart;
3629  }
3630
3631  // Fix the latch value of the new recurrence in the vector loop.
3632  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3633
3634  // Extract the last vector element in the middle block. This will be the
3635  // initial value for the recurrence when jumping to the scalar loop.
3636  auto *ExtractForScalar = Incoming;
3637  if (VF > 1) {
3638    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3639    ExtractForScalar = Builder.CreateExtractElement(
3640        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3641  }
3642  // Extract the second last element in the middle block if the
3643  // Phi is used outside the loop. We need to extract the phi itself
3644  // and not the last element (the phi update in the current iteration). This
3645  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3646  // when the scalar loop is not run at all.
3647  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3648  if (VF > 1)
3649    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3650        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3651  // When loop is unrolled without vectorizing, initialize
3652  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3653  // `Incoming`. This is analogous to the vectorized case above: extracting the
3654  // second last element when VF > 1.
3655  else if (UF > 1)
3656    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3657
3658  // Fix the initial value of the original recurrence in the scalar loop.
3659  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3660  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3661  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3662    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3663    Start->addIncoming(Incoming, BB);
3664  }
3665
3666  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3667  Phi->setName("scalar.recur");
3668
3669  // Finally, fix users of the recurrence outside the loop. The users will need
3670  // either the last value of the scalar recurrence or the last value of the
3671  // vector recurrence we extracted in the middle block. Since the loop is in
3672  // LCSSA form, we just need to find all the phi nodes for the original scalar
3673  // recurrence in the exit block, and then add an edge for the middle block.
3674  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3675    if (LCSSAPhi.getIncomingValue(0) == Phi) {
3676      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3677    }
3678  }
3679}
3680
3681void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3682  Constant *Zero = Builder.getInt32(0);
3683
3684  // Get it's reduction variable descriptor.
3685  assert(Legal->isReductionVariable(Phi) &&
3686         "Unable to find the reduction variable");
3687  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3688
3689  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3690  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3691  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3692  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3693    RdxDesc.getMinMaxRecurrenceKind();
3694  setDebugLocFromInst(Builder, ReductionStartValue);
3695
3696  // We need to generate a reduction vector from the incoming scalar.
3697  // To do so, we need to generate the 'identity' vector and override
3698  // one of the elements with the incoming scalar reduction. We need
3699  // to do it in the vector-loop preheader.
3700  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3701
3702  // This is the vector-clone of the value that leaves the loop.
3703  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3704
3705  // Find the reduction identity variable. Zero for addition, or, xor,
3706  // one for multiplication, -1 for And.
3707  Value *Identity;
3708  Value *VectorStart;
3709  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3710      RK == RecurrenceDescriptor::RK_FloatMinMax) {
3711    // MinMax reduction have the start value as their identify.
3712    if (VF == 1) {
3713      VectorStart = Identity = ReductionStartValue;
3714    } else {
3715      VectorStart = Identity =
3716        Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3717    }
3718  } else {
3719    // Handle other reduction kinds:
3720    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3721        RK, VecTy->getScalarType());
3722    if (VF == 1) {
3723      Identity = Iden;
3724      // This vector is the Identity vector where the first element is the
3725      // incoming scalar reduction.
3726      VectorStart = ReductionStartValue;
3727    } else {
3728      Identity = ConstantVector::getSplat(VF, Iden);
3729
3730      // This vector is the Identity vector where the first element is the
3731      // incoming scalar reduction.
3732      VectorStart =
3733        Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3734    }
3735  }
3736
3737  // Wrap flags are in general invalid after vectorization, clear them.
3738  clearReductionWrapFlags(RdxDesc);
3739
3740  // Fix the vector-loop phi.
3741
3742  // Reductions do not have to start at zero. They can start with
3743  // any loop invariant values.
3744  BasicBlock *Latch = OrigLoop->getLoopLatch();
3745  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3746
3747  for (unsigned Part = 0; Part < UF; ++Part) {
3748    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3749    Value *Val = getOrCreateVectorValue(LoopVal, Part);
3750    // Make sure to add the reduction start value only to the
3751    // first unroll part.
3752    Value *StartVal = (Part == 0) ? VectorStart : Identity;
3753    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3754    cast<PHINode>(VecRdxPhi)
3755      ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3756  }
3757
3758  // Before each round, move the insertion point right between
3759  // the PHIs and the values we are going to write.
3760  // This allows us to write both PHINodes and the extractelement
3761  // instructions.
3762  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3763
3764  setDebugLocFromInst(Builder, LoopExitInst);
3765
3766  // If tail is folded by masking, the vector value to leave the loop should be
3767  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3768  // instead of the former.
3769  if (Cost->foldTailByMasking()) {
3770    for (unsigned Part = 0; Part < UF; ++Part) {
3771      Value *VecLoopExitInst =
3772          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3773      Value *Sel = nullptr;
3774      for (User *U : VecLoopExitInst->users()) {
3775        if (isa<SelectInst>(U)) {
3776          assert(!Sel && "Reduction exit feeding two selects");
3777          Sel = U;
3778        } else
3779          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3780      }
3781      assert(Sel && "Reduction exit feeds no select");
3782      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3783    }
3784  }
3785
3786  // If the vector reduction can be performed in a smaller type, we truncate
3787  // then extend the loop exit value to enable InstCombine to evaluate the
3788  // entire expression in the smaller type.
3789  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3790    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3791    Builder.SetInsertPoint(
3792        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3793    VectorParts RdxParts(UF);
3794    for (unsigned Part = 0; Part < UF; ++Part) {
3795      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3796      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3797      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3798                                        : Builder.CreateZExt(Trunc, VecTy);
3799      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3800           UI != RdxParts[Part]->user_end();)
3801        if (*UI != Trunc) {
3802          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3803          RdxParts[Part] = Extnd;
3804        } else {
3805          ++UI;
3806        }
3807    }
3808    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3809    for (unsigned Part = 0; Part < UF; ++Part) {
3810      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3811      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3812    }
3813  }
3814
3815  // Reduce all of the unrolled parts into a single vector.
3816  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3817  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3818
3819  // The middle block terminator has already been assigned a DebugLoc here (the
3820  // OrigLoop's single latch terminator). We want the whole middle block to
3821  // appear to execute on this line because: (a) it is all compiler generated,
3822  // (b) these instructions are always executed after evaluating the latch
3823  // conditional branch, and (c) other passes may add new predecessors which
3824  // terminate on this line. This is the easiest way to ensure we don't
3825  // accidentally cause an extra step back into the loop while debugging.
3826  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3827  for (unsigned Part = 1; Part < UF; ++Part) {
3828    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3829    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3830      // Floating point operations had to be 'fast' to enable the reduction.
3831      ReducedPartRdx = addFastMathFlag(
3832          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3833                              ReducedPartRdx, "bin.rdx"),
3834          RdxDesc.getFastMathFlags());
3835    else
3836      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3837                                      RdxPart);
3838  }
3839
3840  if (VF > 1) {
3841    bool NoNaN = Legal->hasFunNoNaNAttr();
3842    ReducedPartRdx =
3843        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3844    // If the reduction can be performed in a smaller type, we need to extend
3845    // the reduction to the wider type before we branch to the original loop.
3846    if (Phi->getType() != RdxDesc.getRecurrenceType())
3847      ReducedPartRdx =
3848        RdxDesc.isSigned()
3849        ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3850        : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3851  }
3852
3853  // Create a phi node that merges control-flow from the backedge-taken check
3854  // block and the middle block.
3855  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3856                                        LoopScalarPreHeader->getTerminator());
3857  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3858    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3859  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3860
3861  // Now, we need to fix the users of the reduction variable
3862  // inside and outside of the scalar remainder loop.
3863  // We know that the loop is in LCSSA form. We need to update the
3864  // PHI nodes in the exit blocks.
3865  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3866    // All PHINodes need to have a single entry edge, or two if
3867    // we already fixed them.
3868    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3869
3870    // We found a reduction value exit-PHI. Update it with the
3871    // incoming bypass edge.
3872    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3873      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3874  } // end of the LCSSA phi scan.
3875
3876    // Fix the scalar loop reduction variable with the incoming reduction sum
3877    // from the vector body and from the backedge value.
3878  int IncomingEdgeBlockIdx =
3879    Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3880  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3881  // Pick the other block.
3882  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3883  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3884  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3885}
3886
3887void InnerLoopVectorizer::clearReductionWrapFlags(
3888    RecurrenceDescriptor &RdxDesc) {
3889  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3890  if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3891      RK != RecurrenceDescriptor::RK_IntegerMult)
3892    return;
3893
3894  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3895  assert(LoopExitInstr && "null loop exit instruction");
3896  SmallVector<Instruction *, 8> Worklist;
3897  SmallPtrSet<Instruction *, 8> Visited;
3898  Worklist.push_back(LoopExitInstr);
3899  Visited.insert(LoopExitInstr);
3900
3901  while (!Worklist.empty()) {
3902    Instruction *Cur = Worklist.pop_back_val();
3903    if (isa<OverflowingBinaryOperator>(Cur))
3904      for (unsigned Part = 0; Part < UF; ++Part) {
3905        Value *V = getOrCreateVectorValue(Cur, Part);
3906        cast<Instruction>(V)->dropPoisonGeneratingFlags();
3907      }
3908
3909    for (User *U : Cur->users()) {
3910      Instruction *UI = cast<Instruction>(U);
3911      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3912          Visited.insert(UI).second)
3913        Worklist.push_back(UI);
3914    }
3915  }
3916}
3917
3918void InnerLoopVectorizer::fixLCSSAPHIs() {
3919  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3920    if (LCSSAPhi.getNumIncomingValues() == 1) {
3921      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3922      // Non-instruction incoming values will have only one value.
3923      unsigned LastLane = 0;
3924      if (isa<Instruction>(IncomingValue))
3925          LastLane = Cost->isUniformAfterVectorization(
3926                         cast<Instruction>(IncomingValue), VF)
3927                         ? 0
3928                         : VF - 1;
3929      // Can be a loop invariant incoming value or the last scalar value to be
3930      // extracted from the vectorized loop.
3931      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3932      Value *lastIncomingValue =
3933          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3934      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3935    }
3936  }
3937}
3938
3939void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3940  // The basic block and loop containing the predicated instruction.
3941  auto *PredBB = PredInst->getParent();
3942  auto *VectorLoop = LI->getLoopFor(PredBB);
3943
3944  // Initialize a worklist with the operands of the predicated instruction.
3945  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3946
3947  // Holds instructions that we need to analyze again. An instruction may be
3948  // reanalyzed if we don't yet know if we can sink it or not.
3949  SmallVector<Instruction *, 8> InstsToReanalyze;
3950
3951  // Returns true if a given use occurs in the predicated block. Phi nodes use
3952  // their operands in their corresponding predecessor blocks.
3953  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3954    auto *I = cast<Instruction>(U.getUser());
3955    BasicBlock *BB = I->getParent();
3956    if (auto *Phi = dyn_cast<PHINode>(I))
3957      BB = Phi->getIncomingBlock(
3958          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3959    return BB == PredBB;
3960  };
3961
3962  // Iteratively sink the scalarized operands of the predicated instruction
3963  // into the block we created for it. When an instruction is sunk, it's
3964  // operands are then added to the worklist. The algorithm ends after one pass
3965  // through the worklist doesn't sink a single instruction.
3966  bool Changed;
3967  do {
3968    // Add the instructions that need to be reanalyzed to the worklist, and
3969    // reset the changed indicator.
3970    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3971    InstsToReanalyze.clear();
3972    Changed = false;
3973
3974    while (!Worklist.empty()) {
3975      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3976
3977      // We can't sink an instruction if it is a phi node, is already in the
3978      // predicated block, is not in the loop, or may have side effects.
3979      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3980          !VectorLoop->contains(I) || I->mayHaveSideEffects())
3981        continue;
3982
3983      // It's legal to sink the instruction if all its uses occur in the
3984      // predicated block. Otherwise, there's nothing to do yet, and we may
3985      // need to reanalyze the instruction.
3986      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3987        InstsToReanalyze.push_back(I);
3988        continue;
3989      }
3990
3991      // Move the instruction to the beginning of the predicated block, and add
3992      // it's operands to the worklist.
3993      I->moveBefore(&*PredBB->getFirstInsertionPt());
3994      Worklist.insert(I->op_begin(), I->op_end());
3995
3996      // The sinking may have enabled other instructions to be sunk, so we will
3997      // need to iterate.
3998      Changed = true;
3999    }
4000  } while (Changed);
4001}
4002
4003void InnerLoopVectorizer::fixNonInductionPHIs() {
4004  for (PHINode *OrigPhi : OrigPHIsToFix) {
4005    PHINode *NewPhi =
4006        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4007    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4008
4009    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4010        predecessors(OrigPhi->getParent()));
4011    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4012        predecessors(NewPhi->getParent()));
4013    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4014           "Scalar and Vector BB should have the same number of predecessors");
4015
4016    // The insertion point in Builder may be invalidated by the time we get
4017    // here. Force the Builder insertion point to something valid so that we do
4018    // not run into issues during insertion point restore in
4019    // getOrCreateVectorValue calls below.
4020    Builder.SetInsertPoint(NewPhi);
4021
4022    // The predecessor order is preserved and we can rely on mapping between
4023    // scalar and vector block predecessors.
4024    for (unsigned i = 0; i < NumIncomingValues; ++i) {
4025      BasicBlock *NewPredBB = VectorBBPredecessors[i];
4026
4027      // When looking up the new scalar/vector values to fix up, use incoming
4028      // values from original phi.
4029      Value *ScIncV =
4030          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4031
4032      // Scalar incoming value may need a broadcast
4033      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4034      NewPhi->addIncoming(NewIncV, NewPredBB);
4035    }
4036  }
4037}
4038
4039void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4040                                   unsigned VF, bool IsPtrLoopInvariant,
4041                                   SmallBitVector &IsIndexLoopInvariant) {
4042  // Construct a vector GEP by widening the operands of the scalar GEP as
4043  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4044  // results in a vector of pointers when at least one operand of the GEP
4045  // is vector-typed. Thus, to keep the representation compact, we only use
4046  // vector-typed operands for loop-varying values.
4047
4048  if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4049    // If we are vectorizing, but the GEP has only loop-invariant operands,
4050    // the GEP we build (by only using vector-typed operands for
4051    // loop-varying values) would be a scalar pointer. Thus, to ensure we
4052    // produce a vector of pointers, we need to either arbitrarily pick an
4053    // operand to broadcast, or broadcast a clone of the original GEP.
4054    // Here, we broadcast a clone of the original.
4055    //
4056    // TODO: If at some point we decide to scalarize instructions having
4057    //       loop-invariant operands, this special case will no longer be
4058    //       required. We would add the scalarization decision to
4059    //       collectLoopScalars() and teach getVectorValue() to broadcast
4060    //       the lane-zero scalar value.
4061    auto *Clone = Builder.Insert(GEP->clone());
4062    for (unsigned Part = 0; Part < UF; ++Part) {
4063      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4064      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4065      addMetadata(EntryPart, GEP);
4066    }
4067  } else {
4068    // If the GEP has at least one loop-varying operand, we are sure to
4069    // produce a vector of pointers. But if we are only unrolling, we want
4070    // to produce a scalar GEP for each unroll part. Thus, the GEP we
4071    // produce with the code below will be scalar (if VF == 1) or vector
4072    // (otherwise). Note that for the unroll-only case, we still maintain
4073    // values in the vector mapping with initVector, as we do for other
4074    // instructions.
4075    for (unsigned Part = 0; Part < UF; ++Part) {
4076      // The pointer operand of the new GEP. If it's loop-invariant, we
4077      // won't broadcast it.
4078      auto *Ptr = IsPtrLoopInvariant
4079                      ? GEP->getPointerOperand()
4080                      : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4081
4082      // Collect all the indices for the new GEP. If any index is
4083      // loop-invariant, we won't broadcast it.
4084      SmallVector<Value *, 4> Indices;
4085      for (auto Index : enumerate(GEP->indices())) {
4086        Value *User = Index.value().get();
4087        if (IsIndexLoopInvariant[Index.index()])
4088          Indices.push_back(User);
4089        else
4090          Indices.push_back(getOrCreateVectorValue(User, Part));
4091      }
4092
4093      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4094      // but it should be a vector, otherwise.
4095      auto *NewGEP =
4096          GEP->isInBounds()
4097              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4098                                          Indices)
4099              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4100      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4101             "NewGEP is not a pointer vector");
4102      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4103      addMetadata(NewGEP, GEP);
4104    }
4105  }
4106}
4107
4108void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4109                                              unsigned VF) {
4110  PHINode *P = cast<PHINode>(PN);
4111  if (EnableVPlanNativePath) {
4112    // Currently we enter here in the VPlan-native path for non-induction
4113    // PHIs where all control flow is uniform. We simply widen these PHIs.
4114    // Create a vector phi with no operands - the vector phi operands will be
4115    // set at the end of vector code generation.
4116    Type *VecTy =
4117        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4118    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4119    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4120    OrigPHIsToFix.push_back(P);
4121
4122    return;
4123  }
4124
4125  assert(PN->getParent() == OrigLoop->getHeader() &&
4126         "Non-header phis should have been handled elsewhere");
4127
4128  // In order to support recurrences we need to be able to vectorize Phi nodes.
4129  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4130  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4131  // this value when we vectorize all of the instructions that use the PHI.
4132  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4133    for (unsigned Part = 0; Part < UF; ++Part) {
4134      // This is phase one of vectorizing PHIs.
4135      Type *VecTy =
4136          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4137      Value *EntryPart = PHINode::Create(
4138          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4139      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4140    }
4141    return;
4142  }
4143
4144  setDebugLocFromInst(Builder, P);
4145
4146  // This PHINode must be an induction variable.
4147  // Make sure that we know about it.
4148  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4149
4150  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4151  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4152
4153  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4154  // which can be found from the original scalar operations.
4155  switch (II.getKind()) {
4156  case InductionDescriptor::IK_NoInduction:
4157    llvm_unreachable("Unknown induction");
4158  case InductionDescriptor::IK_IntInduction:
4159  case InductionDescriptor::IK_FpInduction:
4160    llvm_unreachable("Integer/fp induction is handled elsewhere.");
4161  case InductionDescriptor::IK_PtrInduction: {
4162    // Handle the pointer induction variable case.
4163    assert(P->getType()->isPointerTy() && "Unexpected type.");
4164    // This is the normalized GEP that starts counting at zero.
4165    Value *PtrInd = Induction;
4166    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4167    // Determine the number of scalars we need to generate for each unroll
4168    // iteration. If the instruction is uniform, we only need to generate the
4169    // first lane. Otherwise, we generate all VF values.
4170    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4171    // These are the scalar results. Notice that we don't generate vector GEPs
4172    // because scalar GEPs result in better code.
4173    for (unsigned Part = 0; Part < UF; ++Part) {
4174      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4175        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4176        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4177        Value *SclrGep =
4178            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4179        SclrGep->setName("next.gep");
4180        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4181      }
4182    }
4183    return;
4184  }
4185  }
4186}
4187
4188/// A helper function for checking whether an integer division-related
4189/// instruction may divide by zero (in which case it must be predicated if
4190/// executed conditionally in the scalar code).
4191/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4192/// Non-zero divisors that are non compile-time constants will not be
4193/// converted into multiplication, so we will still end up scalarizing
4194/// the division, but can do so w/o predication.
4195static bool mayDivideByZero(Instruction &I) {
4196  assert((I.getOpcode() == Instruction::UDiv ||
4197          I.getOpcode() == Instruction::SDiv ||
4198          I.getOpcode() == Instruction::URem ||
4199          I.getOpcode() == Instruction::SRem) &&
4200         "Unexpected instruction");
4201  Value *Divisor = I.getOperand(1);
4202  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4203  return !CInt || CInt->isZero();
4204}
4205
4206void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4207  switch (I.getOpcode()) {
4208  case Instruction::Br:
4209  case Instruction::PHI:
4210  case Instruction::GetElementPtr:
4211    llvm_unreachable("This instruction is handled by a different recipe.");
4212  case Instruction::UDiv:
4213  case Instruction::SDiv:
4214  case Instruction::SRem:
4215  case Instruction::URem:
4216  case Instruction::Add:
4217  case Instruction::FAdd:
4218  case Instruction::Sub:
4219  case Instruction::FSub:
4220  case Instruction::FNeg:
4221  case Instruction::Mul:
4222  case Instruction::FMul:
4223  case Instruction::FDiv:
4224  case Instruction::FRem:
4225  case Instruction::Shl:
4226  case Instruction::LShr:
4227  case Instruction::AShr:
4228  case Instruction::And:
4229  case Instruction::Or:
4230  case Instruction::Xor: {
4231    // Just widen unops and binops.
4232    setDebugLocFromInst(Builder, &I);
4233
4234    for (unsigned Part = 0; Part < UF; ++Part) {
4235      SmallVector<Value *, 2> Ops;
4236      for (Value *Op : I.operands())
4237        Ops.push_back(getOrCreateVectorValue(Op, Part));
4238
4239      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4240
4241      if (auto *VecOp = dyn_cast<Instruction>(V))
4242        VecOp->copyIRFlags(&I);
4243
4244      // Use this vector value for all users of the original instruction.
4245      VectorLoopValueMap.setVectorValue(&I, Part, V);
4246      addMetadata(V, &I);
4247    }
4248
4249    break;
4250  }
4251  case Instruction::Select: {
4252    // Widen selects.
4253    // If the selector is loop invariant we can create a select
4254    // instruction with a scalar condition. Otherwise, use vector-select.
4255    auto *SE = PSE.getSE();
4256    bool InvariantCond =
4257        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4258    setDebugLocFromInst(Builder, &I);
4259
4260    // The condition can be loop invariant  but still defined inside the
4261    // loop. This means that we can't just use the original 'cond' value.
4262    // We have to take the 'vectorized' value and pick the first lane.
4263    // Instcombine will make this a no-op.
4264
4265    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4266
4267    for (unsigned Part = 0; Part < UF; ++Part) {
4268      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4269      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4270      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4271      Value *Sel =
4272          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4273      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4274      addMetadata(Sel, &I);
4275    }
4276
4277    break;
4278  }
4279
4280  case Instruction::ICmp:
4281  case Instruction::FCmp: {
4282    // Widen compares. Generate vector compares.
4283    bool FCmp = (I.getOpcode() == Instruction::FCmp);
4284    auto *Cmp = cast<CmpInst>(&I);
4285    setDebugLocFromInst(Builder, Cmp);
4286    for (unsigned Part = 0; Part < UF; ++Part) {
4287      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4288      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4289      Value *C = nullptr;
4290      if (FCmp) {
4291        // Propagate fast math flags.
4292        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4293        Builder.setFastMathFlags(Cmp->getFastMathFlags());
4294        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4295      } else {
4296        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4297      }
4298      VectorLoopValueMap.setVectorValue(&I, Part, C);
4299      addMetadata(C, &I);
4300    }
4301
4302    break;
4303  }
4304
4305  case Instruction::ZExt:
4306  case Instruction::SExt:
4307  case Instruction::FPToUI:
4308  case Instruction::FPToSI:
4309  case Instruction::FPExt:
4310  case Instruction::PtrToInt:
4311  case Instruction::IntToPtr:
4312  case Instruction::SIToFP:
4313  case Instruction::UIToFP:
4314  case Instruction::Trunc:
4315  case Instruction::FPTrunc:
4316  case Instruction::BitCast: {
4317    auto *CI = cast<CastInst>(&I);
4318    setDebugLocFromInst(Builder, CI);
4319
4320    /// Vectorize casts.
4321    Type *DestTy =
4322        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4323
4324    for (unsigned Part = 0; Part < UF; ++Part) {
4325      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4326      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4327      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4328      addMetadata(Cast, &I);
4329    }
4330    break;
4331  }
4332
4333  case Instruction::Call: {
4334    // Ignore dbg intrinsics.
4335    if (isa<DbgInfoIntrinsic>(I))
4336      break;
4337    setDebugLocFromInst(Builder, &I);
4338
4339    Module *M = I.getParent()->getParent()->getParent();
4340    auto *CI = cast<CallInst>(&I);
4341
4342    StringRef FnName = CI->getCalledFunction()->getName();
4343    Function *F = CI->getCalledFunction();
4344    Type *RetTy = ToVectorTy(CI->getType(), VF);
4345    SmallVector<Type *, 4> Tys;
4346    for (Value *ArgOperand : CI->arg_operands())
4347      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4348
4349    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4350
4351    // The flag shows whether we use Intrinsic or a usual Call for vectorized
4352    // version of the instruction.
4353    // Is it beneficial to perform intrinsic call compared to lib call?
4354    bool NeedToScalarize;
4355    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4356    bool UseVectorIntrinsic =
4357        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4358    assert((UseVectorIntrinsic || !NeedToScalarize) &&
4359           "Instruction should be scalarized elsewhere.");
4360
4361    for (unsigned Part = 0; Part < UF; ++Part) {
4362      SmallVector<Value *, 4> Args;
4363      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4364        Value *Arg = CI->getArgOperand(i);
4365        // Some intrinsics have a scalar argument - don't replace it with a
4366        // vector.
4367        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4368          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4369        Args.push_back(Arg);
4370      }
4371
4372      Function *VectorF;
4373      if (UseVectorIntrinsic) {
4374        // Use vector version of the intrinsic.
4375        Type *TysForDecl[] = {CI->getType()};
4376        if (VF > 1)
4377          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4378        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4379      } else {
4380        // Use vector version of the library call.
4381        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4382        assert(!VFnName.empty() && "Vector function name is empty.");
4383        VectorF = M->getFunction(VFnName);
4384        if (!VectorF) {
4385          // Generate a declaration
4386          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4387          VectorF =
4388              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4389          VectorF->copyAttributesFrom(F);
4390        }
4391      }
4392      assert(VectorF && "Can't create vector function.");
4393
4394      SmallVector<OperandBundleDef, 1> OpBundles;
4395      CI->getOperandBundlesAsDefs(OpBundles);
4396      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4397
4398      if (isa<FPMathOperator>(V))
4399        V->copyFastMathFlags(CI);
4400
4401      VectorLoopValueMap.setVectorValue(&I, Part, V);
4402      addMetadata(V, &I);
4403    }
4404
4405    break;
4406  }
4407
4408  default:
4409    // This instruction is not vectorized by simple widening.
4410    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4411    llvm_unreachable("Unhandled instruction!");
4412  } // end of switch.
4413}
4414
4415void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4416  // We should not collect Scalars more than once per VF. Right now, this
4417  // function is called from collectUniformsAndScalars(), which already does
4418  // this check. Collecting Scalars for VF=1 does not make any sense.
4419  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4420         "This function should not be visited twice for the same VF");
4421
4422  SmallSetVector<Instruction *, 8> Worklist;
4423
4424  // These sets are used to seed the analysis with pointers used by memory
4425  // accesses that will remain scalar.
4426  SmallSetVector<Instruction *, 8> ScalarPtrs;
4427  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4428
4429  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4430  // The pointer operands of loads and stores will be scalar as long as the
4431  // memory access is not a gather or scatter operation. The value operand of a
4432  // store will remain scalar if the store is scalarized.
4433  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4434    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4435    assert(WideningDecision != CM_Unknown &&
4436           "Widening decision should be ready at this moment");
4437    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4438      if (Ptr == Store->getValueOperand())
4439        return WideningDecision == CM_Scalarize;
4440    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4441           "Ptr is neither a value or pointer operand");
4442    return WideningDecision != CM_GatherScatter;
4443  };
4444
4445  // A helper that returns true if the given value is a bitcast or
4446  // getelementptr instruction contained in the loop.
4447  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4448    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4449            isa<GetElementPtrInst>(V)) &&
4450           !TheLoop->isLoopInvariant(V);
4451  };
4452
4453  // A helper that evaluates a memory access's use of a pointer. If the use
4454  // will be a scalar use, and the pointer is only used by memory accesses, we
4455  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4456  // PossibleNonScalarPtrs.
4457  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4458    // We only care about bitcast and getelementptr instructions contained in
4459    // the loop.
4460    if (!isLoopVaryingBitCastOrGEP(Ptr))
4461      return;
4462
4463    // If the pointer has already been identified as scalar (e.g., if it was
4464    // also identified as uniform), there's nothing to do.
4465    auto *I = cast<Instruction>(Ptr);
4466    if (Worklist.count(I))
4467      return;
4468
4469    // If the use of the pointer will be a scalar use, and all users of the
4470    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4471    // place the pointer in PossibleNonScalarPtrs.
4472    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4473          return isa<LoadInst>(U) || isa<StoreInst>(U);
4474        }))
4475      ScalarPtrs.insert(I);
4476    else
4477      PossibleNonScalarPtrs.insert(I);
4478  };
4479
4480  // We seed the scalars analysis with three classes of instructions: (1)
4481  // instructions marked uniform-after-vectorization, (2) bitcast and
4482  // getelementptr instructions used by memory accesses requiring a scalar use,
4483  // and (3) pointer induction variables and their update instructions (we
4484  // currently only scalarize these).
4485  //
4486  // (1) Add to the worklist all instructions that have been identified as
4487  // uniform-after-vectorization.
4488  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4489
4490  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4491  // memory accesses requiring a scalar use. The pointer operands of loads and
4492  // stores will be scalar as long as the memory accesses is not a gather or
4493  // scatter operation. The value operand of a store will remain scalar if the
4494  // store is scalarized.
4495  for (auto *BB : TheLoop->blocks())
4496    for (auto &I : *BB) {
4497      if (auto *Load = dyn_cast<LoadInst>(&I)) {
4498        evaluatePtrUse(Load, Load->getPointerOperand());
4499      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4500        evaluatePtrUse(Store, Store->getPointerOperand());
4501        evaluatePtrUse(Store, Store->getValueOperand());
4502      }
4503    }
4504  for (auto *I : ScalarPtrs)
4505    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4506      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4507      Worklist.insert(I);
4508    }
4509
4510  // (3) Add to the worklist all pointer induction variables and their update
4511  // instructions.
4512  //
4513  // TODO: Once we are able to vectorize pointer induction variables we should
4514  //       no longer insert them into the worklist here.
4515  auto *Latch = TheLoop->getLoopLatch();
4516  for (auto &Induction : *Legal->getInductionVars()) {
4517    auto *Ind = Induction.first;
4518    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4519    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4520      continue;
4521    Worklist.insert(Ind);
4522    Worklist.insert(IndUpdate);
4523    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4524    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4525                      << "\n");
4526  }
4527
4528  // Insert the forced scalars.
4529  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4530  // induction variable when the PHI user is scalarized.
4531  auto ForcedScalar = ForcedScalars.find(VF);
4532  if (ForcedScalar != ForcedScalars.end())
4533    for (auto *I : ForcedScalar->second)
4534      Worklist.insert(I);
4535
4536  // Expand the worklist by looking through any bitcasts and getelementptr
4537  // instructions we've already identified as scalar. This is similar to the
4538  // expansion step in collectLoopUniforms(); however, here we're only
4539  // expanding to include additional bitcasts and getelementptr instructions.
4540  unsigned Idx = 0;
4541  while (Idx != Worklist.size()) {
4542    Instruction *Dst = Worklist[Idx++];
4543    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4544      continue;
4545    auto *Src = cast<Instruction>(Dst->getOperand(0));
4546    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4547          auto *J = cast<Instruction>(U);
4548          return !TheLoop->contains(J) || Worklist.count(J) ||
4549                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4550                  isScalarUse(J, Src));
4551        })) {
4552      Worklist.insert(Src);
4553      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4554    }
4555  }
4556
4557  // An induction variable will remain scalar if all users of the induction
4558  // variable and induction variable update remain scalar.
4559  for (auto &Induction : *Legal->getInductionVars()) {
4560    auto *Ind = Induction.first;
4561    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4562
4563    // We already considered pointer induction variables, so there's no reason
4564    // to look at their users again.
4565    //
4566    // TODO: Once we are able to vectorize pointer induction variables we
4567    //       should no longer skip over them here.
4568    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4569      continue;
4570
4571    // Determine if all users of the induction variable are scalar after
4572    // vectorization.
4573    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4574      auto *I = cast<Instruction>(U);
4575      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4576    });
4577    if (!ScalarInd)
4578      continue;
4579
4580    // Determine if all users of the induction variable update instruction are
4581    // scalar after vectorization.
4582    auto ScalarIndUpdate =
4583        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4584          auto *I = cast<Instruction>(U);
4585          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4586        });
4587    if (!ScalarIndUpdate)
4588      continue;
4589
4590    // The induction variable and its update instruction will remain scalar.
4591    Worklist.insert(Ind);
4592    Worklist.insert(IndUpdate);
4593    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4594    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4595                      << "\n");
4596  }
4597
4598  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4599}
4600
4601bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4602  if (!blockNeedsPredication(I->getParent()))
4603    return false;
4604  switch(I->getOpcode()) {
4605  default:
4606    break;
4607  case Instruction::Load:
4608  case Instruction::Store: {
4609    if (!Legal->isMaskRequired(I))
4610      return false;
4611    auto *Ptr = getLoadStorePointerOperand(I);
4612    auto *Ty = getMemInstValueType(I);
4613    // We have already decided how to vectorize this instruction, get that
4614    // result.
4615    if (VF > 1) {
4616      InstWidening WideningDecision = getWideningDecision(I, VF);
4617      assert(WideningDecision != CM_Unknown &&
4618             "Widening decision should be ready at this moment");
4619      return WideningDecision == CM_Scalarize;
4620    }
4621    const MaybeAlign Alignment = getLoadStoreAlignment(I);
4622    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4623                                isLegalMaskedGather(Ty, Alignment))
4624                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4625                                isLegalMaskedScatter(Ty, Alignment));
4626  }
4627  case Instruction::UDiv:
4628  case Instruction::SDiv:
4629  case Instruction::SRem:
4630  case Instruction::URem:
4631    return mayDivideByZero(*I);
4632  }
4633  return false;
4634}
4635
4636bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4637                                                               unsigned VF) {
4638  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4639  assert(getWideningDecision(I, VF) == CM_Unknown &&
4640         "Decision should not be set yet.");
4641  auto *Group = getInterleavedAccessGroup(I);
4642  assert(Group && "Must have a group.");
4643
4644  // If the instruction's allocated size doesn't equal it's type size, it
4645  // requires padding and will be scalarized.
4646  auto &DL = I->getModule()->getDataLayout();
4647  auto *ScalarTy = getMemInstValueType(I);
4648  if (hasIrregularType(ScalarTy, DL, VF))
4649    return false;
4650
4651  // Check if masking is required.
4652  // A Group may need masking for one of two reasons: it resides in a block that
4653  // needs predication, or it was decided to use masking to deal with gaps.
4654  bool PredicatedAccessRequiresMasking =
4655      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4656  bool AccessWithGapsRequiresMasking =
4657      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4658  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4659    return true;
4660
4661  // If masked interleaving is required, we expect that the user/target had
4662  // enabled it, because otherwise it either wouldn't have been created or
4663  // it should have been invalidated by the CostModel.
4664  assert(useMaskedInterleavedAccesses(TTI) &&
4665         "Masked interleave-groups for predicated accesses are not enabled.");
4666
4667  auto *Ty = getMemInstValueType(I);
4668  const MaybeAlign Alignment = getLoadStoreAlignment(I);
4669  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4670                          : TTI.isLegalMaskedStore(Ty, Alignment);
4671}
4672
4673bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4674                                                               unsigned VF) {
4675  // Get and ensure we have a valid memory instruction.
4676  LoadInst *LI = dyn_cast<LoadInst>(I);
4677  StoreInst *SI = dyn_cast<StoreInst>(I);
4678  assert((LI || SI) && "Invalid memory instruction");
4679
4680  auto *Ptr = getLoadStorePointerOperand(I);
4681
4682  // In order to be widened, the pointer should be consecutive, first of all.
4683  if (!Legal->isConsecutivePtr(Ptr))
4684    return false;
4685
4686  // If the instruction is a store located in a predicated block, it will be
4687  // scalarized.
4688  if (isScalarWithPredication(I))
4689    return false;
4690
4691  // If the instruction's allocated size doesn't equal it's type size, it
4692  // requires padding and will be scalarized.
4693  auto &DL = I->getModule()->getDataLayout();
4694  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4695  if (hasIrregularType(ScalarTy, DL, VF))
4696    return false;
4697
4698  return true;
4699}
4700
4701void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4702  // We should not collect Uniforms more than once per VF. Right now,
4703  // this function is called from collectUniformsAndScalars(), which
4704  // already does this check. Collecting Uniforms for VF=1 does not make any
4705  // sense.
4706
4707  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4708         "This function should not be visited twice for the same VF");
4709
4710  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4711  // not analyze again.  Uniforms.count(VF) will return 1.
4712  Uniforms[VF].clear();
4713
4714  // We now know that the loop is vectorizable!
4715  // Collect instructions inside the loop that will remain uniform after
4716  // vectorization.
4717
4718  // Global values, params and instructions outside of current loop are out of
4719  // scope.
4720  auto isOutOfScope = [&](Value *V) -> bool {
4721    Instruction *I = dyn_cast<Instruction>(V);
4722    return (!I || !TheLoop->contains(I));
4723  };
4724
4725  SetVector<Instruction *> Worklist;
4726  BasicBlock *Latch = TheLoop->getLoopLatch();
4727
4728  // Instructions that are scalar with predication must not be considered
4729  // uniform after vectorization, because that would create an erroneous
4730  // replicating region where only a single instance out of VF should be formed.
4731  // TODO: optimize such seldom cases if found important, see PR40816.
4732  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4733    if (isScalarWithPredication(I, VF)) {
4734      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4735                        << *I << "\n");
4736      return;
4737    }
4738    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4739    Worklist.insert(I);
4740  };
4741
4742  // Start with the conditional branch. If the branch condition is an
4743  // instruction contained in the loop that is only used by the branch, it is
4744  // uniform.
4745  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4746  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4747    addToWorklistIfAllowed(Cmp);
4748
4749  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4750  // are pointers that are treated like consecutive pointers during
4751  // vectorization. The pointer operands of interleaved accesses are an
4752  // example.
4753  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4754
4755  // Holds pointer operands of instructions that are possibly non-uniform.
4756  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4757
4758  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4759    InstWidening WideningDecision = getWideningDecision(I, VF);
4760    assert(WideningDecision != CM_Unknown &&
4761           "Widening decision should be ready at this moment");
4762
4763    return (WideningDecision == CM_Widen ||
4764            WideningDecision == CM_Widen_Reverse ||
4765            WideningDecision == CM_Interleave);
4766  };
4767  // Iterate over the instructions in the loop, and collect all
4768  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4769  // that a consecutive-like pointer operand will be scalarized, we collect it
4770  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4771  // getelementptr instruction can be used by both vectorized and scalarized
4772  // memory instructions. For example, if a loop loads and stores from the same
4773  // location, but the store is conditional, the store will be scalarized, and
4774  // the getelementptr won't remain uniform.
4775  for (auto *BB : TheLoop->blocks())
4776    for (auto &I : *BB) {
4777      // If there's no pointer operand, there's nothing to do.
4778      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4779      if (!Ptr)
4780        continue;
4781
4782      // True if all users of Ptr are memory accesses that have Ptr as their
4783      // pointer operand.
4784      auto UsersAreMemAccesses =
4785          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4786            return getLoadStorePointerOperand(U) == Ptr;
4787          });
4788
4789      // Ensure the memory instruction will not be scalarized or used by
4790      // gather/scatter, making its pointer operand non-uniform. If the pointer
4791      // operand is used by any instruction other than a memory access, we
4792      // conservatively assume the pointer operand may be non-uniform.
4793      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4794        PossibleNonUniformPtrs.insert(Ptr);
4795
4796      // If the memory instruction will be vectorized and its pointer operand
4797      // is consecutive-like, or interleaving - the pointer operand should
4798      // remain uniform.
4799      else
4800        ConsecutiveLikePtrs.insert(Ptr);
4801    }
4802
4803  // Add to the Worklist all consecutive and consecutive-like pointers that
4804  // aren't also identified as possibly non-uniform.
4805  for (auto *V : ConsecutiveLikePtrs)
4806    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4807      addToWorklistIfAllowed(V);
4808
4809  // Expand Worklist in topological order: whenever a new instruction
4810  // is added , its users should be already inside Worklist.  It ensures
4811  // a uniform instruction will only be used by uniform instructions.
4812  unsigned idx = 0;
4813  while (idx != Worklist.size()) {
4814    Instruction *I = Worklist[idx++];
4815
4816    for (auto OV : I->operand_values()) {
4817      // isOutOfScope operands cannot be uniform instructions.
4818      if (isOutOfScope(OV))
4819        continue;
4820      // First order recurrence Phi's should typically be considered
4821      // non-uniform.
4822      auto *OP = dyn_cast<PHINode>(OV);
4823      if (OP && Legal->isFirstOrderRecurrence(OP))
4824        continue;
4825      // If all the users of the operand are uniform, then add the
4826      // operand into the uniform worklist.
4827      auto *OI = cast<Instruction>(OV);
4828      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4829            auto *J = cast<Instruction>(U);
4830            return Worklist.count(J) ||
4831                   (OI == getLoadStorePointerOperand(J) &&
4832                    isUniformDecision(J, VF));
4833          }))
4834        addToWorklistIfAllowed(OI);
4835    }
4836  }
4837
4838  // Returns true if Ptr is the pointer operand of a memory access instruction
4839  // I, and I is known to not require scalarization.
4840  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4841    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4842  };
4843
4844  // For an instruction to be added into Worklist above, all its users inside
4845  // the loop should also be in Worklist. However, this condition cannot be
4846  // true for phi nodes that form a cyclic dependence. We must process phi
4847  // nodes separately. An induction variable will remain uniform if all users
4848  // of the induction variable and induction variable update remain uniform.
4849  // The code below handles both pointer and non-pointer induction variables.
4850  for (auto &Induction : *Legal->getInductionVars()) {
4851    auto *Ind = Induction.first;
4852    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4853
4854    // Determine if all users of the induction variable are uniform after
4855    // vectorization.
4856    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4857      auto *I = cast<Instruction>(U);
4858      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4859             isVectorizedMemAccessUse(I, Ind);
4860    });
4861    if (!UniformInd)
4862      continue;
4863
4864    // Determine if all users of the induction variable update instruction are
4865    // uniform after vectorization.
4866    auto UniformIndUpdate =
4867        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4868          auto *I = cast<Instruction>(U);
4869          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4870                 isVectorizedMemAccessUse(I, IndUpdate);
4871        });
4872    if (!UniformIndUpdate)
4873      continue;
4874
4875    // The induction variable and its update instruction will remain uniform.
4876    addToWorklistIfAllowed(Ind);
4877    addToWorklistIfAllowed(IndUpdate);
4878  }
4879
4880  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4881}
4882
4883bool LoopVectorizationCostModel::runtimeChecksRequired() {
4884  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4885
4886  if (Legal->getRuntimePointerChecking()->Need) {
4887    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4888        "runtime pointer checks needed. Enable vectorization of this "
4889        "loop with '#pragma clang loop vectorize(enable)' when "
4890        "compiling with -Os/-Oz",
4891        "CantVersionLoopWithOptForSize", ORE, TheLoop);
4892    return true;
4893  }
4894
4895  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4896    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4897        "runtime SCEV checks needed. Enable vectorization of this "
4898        "loop with '#pragma clang loop vectorize(enable)' when "
4899        "compiling with -Os/-Oz",
4900        "CantVersionLoopWithOptForSize", ORE, TheLoop);
4901    return true;
4902  }
4903
4904  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4905  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4906    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4907        "runtime stride == 1 checks needed. Enable vectorization of "
4908        "this loop with '#pragma clang loop vectorize(enable)' when "
4909        "compiling with -Os/-Oz",
4910        "CantVersionLoopWithOptForSize", ORE, TheLoop);
4911    return true;
4912  }
4913
4914  return false;
4915}
4916
4917Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4918  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4919    // TODO: It may by useful to do since it's still likely to be dynamically
4920    // uniform if the target can skip.
4921    reportVectorizationFailure(
4922        "Not inserting runtime ptr check for divergent target",
4923        "runtime pointer checks needed. Not enabled for divergent target",
4924        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4925    return None;
4926  }
4927
4928  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4929  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4930  if (TC == 1) {
4931    reportVectorizationFailure("Single iteration (non) loop",
4932        "loop trip count is one, irrelevant for vectorization",
4933        "SingleIterationLoop", ORE, TheLoop);
4934    return None;
4935  }
4936
4937  switch (ScalarEpilogueStatus) {
4938  case CM_ScalarEpilogueAllowed:
4939    return computeFeasibleMaxVF(TC);
4940  case CM_ScalarEpilogueNotNeededUsePredicate:
4941    LLVM_DEBUG(
4942        dbgs() << "LV: vector predicate hint/switch found.\n"
4943               << "LV: Not allowing scalar epilogue, creating predicated "
4944               << "vector loop.\n");
4945    break;
4946  case CM_ScalarEpilogueNotAllowedLowTripLoop:
4947    // fallthrough as a special case of OptForSize
4948  case CM_ScalarEpilogueNotAllowedOptSize:
4949    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4950      LLVM_DEBUG(
4951          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4952    else
4953      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4954                        << "count.\n");
4955
4956    // Bail if runtime checks are required, which are not good when optimising
4957    // for size.
4958    if (runtimeChecksRequired())
4959      return None;
4960    break;
4961  }
4962
4963  // Now try the tail folding
4964
4965  // Invalidate interleave groups that require an epilogue if we can't mask
4966  // the interleave-group.
4967  if (!useMaskedInterleavedAccesses(TTI))
4968    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4969
4970  unsigned MaxVF = computeFeasibleMaxVF(TC);
4971  if (TC > 0 && TC % MaxVF == 0) {
4972    // Accept MaxVF if we do not have a tail.
4973    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4974    return MaxVF;
4975  }
4976
4977  // If we don't know the precise trip count, or if the trip count that we
4978  // found modulo the vectorization factor is not zero, try to fold the tail
4979  // by masking.
4980  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4981  if (Legal->prepareToFoldTailByMasking()) {
4982    FoldTailByMasking = true;
4983    return MaxVF;
4984  }
4985
4986  if (TC == 0) {
4987    reportVectorizationFailure(
4988        "Unable to calculate the loop count due to complex control flow",
4989        "unable to calculate the loop count due to complex control flow",
4990        "UnknownLoopCountComplexCFG", ORE, TheLoop);
4991    return None;
4992  }
4993
4994  reportVectorizationFailure(
4995      "Cannot optimize for size and vectorize at the same time.",
4996      "cannot optimize for size and vectorize at the same time. "
4997      "Enable vectorization of this loop with '#pragma clang loop "
4998      "vectorize(enable)' when compiling with -Os/-Oz",
4999      "NoTailLoopWithOptForSize", ORE, TheLoop);
5000  return None;
5001}
5002
5003unsigned
5004LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5005  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5006  unsigned SmallestType, WidestType;
5007  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5008  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5009
5010  // Get the maximum safe dependence distance in bits computed by LAA.
5011  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5012  // the memory accesses that is most restrictive (involved in the smallest
5013  // dependence distance).
5014  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5015
5016  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5017
5018  unsigned MaxVectorSize = WidestRegister / WidestType;
5019
5020  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5021                    << " / " << WidestType << " bits.\n");
5022  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5023                    << WidestRegister << " bits.\n");
5024
5025  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5026                                 " into one vector!");
5027  if (MaxVectorSize == 0) {
5028    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5029    MaxVectorSize = 1;
5030    return MaxVectorSize;
5031  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5032             isPowerOf2_32(ConstTripCount)) {
5033    // We need to clamp the VF to be the ConstTripCount. There is no point in
5034    // choosing a higher viable VF as done in the loop below.
5035    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5036                      << ConstTripCount << "\n");
5037    MaxVectorSize = ConstTripCount;
5038    return MaxVectorSize;
5039  }
5040
5041  unsigned MaxVF = MaxVectorSize;
5042  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5043      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5044    // Collect all viable vectorization factors larger than the default MaxVF
5045    // (i.e. MaxVectorSize).
5046    SmallVector<unsigned, 8> VFs;
5047    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5048    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5049      VFs.push_back(VS);
5050
5051    // For each VF calculate its register usage.
5052    auto RUs = calculateRegisterUsage(VFs);
5053
5054    // Select the largest VF which doesn't require more registers than existing
5055    // ones.
5056    for (int i = RUs.size() - 1; i >= 0; --i) {
5057      bool Selected = true;
5058      for (auto& pair : RUs[i].MaxLocalUsers) {
5059        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5060        if (pair.second > TargetNumRegisters)
5061          Selected = false;
5062      }
5063      if (Selected) {
5064        MaxVF = VFs[i];
5065        break;
5066      }
5067    }
5068    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5069      if (MaxVF < MinVF) {
5070        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5071                          << ") with target's minimum: " << MinVF << '\n');
5072        MaxVF = MinVF;
5073      }
5074    }
5075  }
5076  return MaxVF;
5077}
5078
5079VectorizationFactor
5080LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5081  float Cost = expectedCost(1).first;
5082  const float ScalarCost = Cost;
5083  unsigned Width = 1;
5084  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5085
5086  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5087  if (ForceVectorization && MaxVF > 1) {
5088    // Ignore scalar width, because the user explicitly wants vectorization.
5089    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5090    // evaluation.
5091    Cost = std::numeric_limits<float>::max();
5092  }
5093
5094  for (unsigned i = 2; i <= MaxVF; i *= 2) {
5095    // Notice that the vector loop needs to be executed less times, so
5096    // we need to divide the cost of the vector loops by the width of
5097    // the vector elements.
5098    VectorizationCostTy C = expectedCost(i);
5099    float VectorCost = C.first / (float)i;
5100    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5101                      << " costs: " << (int)VectorCost << ".\n");
5102    if (!C.second && !ForceVectorization) {
5103      LLVM_DEBUG(
5104          dbgs() << "LV: Not considering vector loop of width " << i
5105                 << " because it will not generate any vector instructions.\n");
5106      continue;
5107    }
5108    if (VectorCost < Cost) {
5109      Cost = VectorCost;
5110      Width = i;
5111    }
5112  }
5113
5114  if (!EnableCondStoresVectorization && NumPredStores) {
5115    reportVectorizationFailure("There are conditional stores.",
5116        "store that is conditionally executed prevents vectorization",
5117        "ConditionalStore", ORE, TheLoop);
5118    Width = 1;
5119    Cost = ScalarCost;
5120  }
5121
5122  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5123             << "LV: Vectorization seems to be not beneficial, "
5124             << "but was forced by a user.\n");
5125  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5126  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5127  return Factor;
5128}
5129
5130std::pair<unsigned, unsigned>
5131LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5132  unsigned MinWidth = -1U;
5133  unsigned MaxWidth = 8;
5134  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5135
5136  // For each block.
5137  for (BasicBlock *BB : TheLoop->blocks()) {
5138    // For each instruction in the loop.
5139    for (Instruction &I : BB->instructionsWithoutDebug()) {
5140      Type *T = I.getType();
5141
5142      // Skip ignored values.
5143      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5144        continue;
5145
5146      // Only examine Loads, Stores and PHINodes.
5147      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5148        continue;
5149
5150      // Examine PHI nodes that are reduction variables. Update the type to
5151      // account for the recurrence type.
5152      if (auto *PN = dyn_cast<PHINode>(&I)) {
5153        if (!Legal->isReductionVariable(PN))
5154          continue;
5155        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5156        T = RdxDesc.getRecurrenceType();
5157      }
5158
5159      // Examine the stored values.
5160      if (auto *ST = dyn_cast<StoreInst>(&I))
5161        T = ST->getValueOperand()->getType();
5162
5163      // Ignore loaded pointer types and stored pointer types that are not
5164      // vectorizable.
5165      //
5166      // FIXME: The check here attempts to predict whether a load or store will
5167      //        be vectorized. We only know this for certain after a VF has
5168      //        been selected. Here, we assume that if an access can be
5169      //        vectorized, it will be. We should also look at extending this
5170      //        optimization to non-pointer types.
5171      //
5172      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5173          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5174        continue;
5175
5176      MinWidth = std::min(MinWidth,
5177                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5178      MaxWidth = std::max(MaxWidth,
5179                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5180    }
5181  }
5182
5183  return {MinWidth, MaxWidth};
5184}
5185
5186unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5187                                                           unsigned LoopCost) {
5188  // -- The interleave heuristics --
5189  // We interleave the loop in order to expose ILP and reduce the loop overhead.
5190  // There are many micro-architectural considerations that we can't predict
5191  // at this level. For example, frontend pressure (on decode or fetch) due to
5192  // code size, or the number and capabilities of the execution ports.
5193  //
5194  // We use the following heuristics to select the interleave count:
5195  // 1. If the code has reductions, then we interleave to break the cross
5196  // iteration dependency.
5197  // 2. If the loop is really small, then we interleave to reduce the loop
5198  // overhead.
5199  // 3. We don't interleave if we think that we will spill registers to memory
5200  // due to the increased register pressure.
5201
5202  if (!isScalarEpilogueAllowed())
5203    return 1;
5204
5205  // We used the distance for the interleave count.
5206  if (Legal->getMaxSafeDepDistBytes() != -1U)
5207    return 1;
5208
5209  // Do not interleave loops with a relatively small known or estimated trip
5210  // count.
5211  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5212  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5213    return 1;
5214
5215  RegisterUsage R = calculateRegisterUsage({VF})[0];
5216  // We divide by these constants so assume that we have at least one
5217  // instruction that uses at least one register.
5218  for (auto& pair : R.MaxLocalUsers) {
5219    pair.second = std::max(pair.second, 1U);
5220  }
5221
5222  // We calculate the interleave count using the following formula.
5223  // Subtract the number of loop invariants from the number of available
5224  // registers. These registers are used by all of the interleaved instances.
5225  // Next, divide the remaining registers by the number of registers that is
5226  // required by the loop, in order to estimate how many parallel instances
5227  // fit without causing spills. All of this is rounded down if necessary to be
5228  // a power of two. We want power of two interleave count to simplify any
5229  // addressing operations or alignment considerations.
5230  // We also want power of two interleave counts to ensure that the induction
5231  // variable of the vector loop wraps to zero, when tail is folded by masking;
5232  // this currently happens when OptForSize, in which case IC is set to 1 above.
5233  unsigned IC = UINT_MAX;
5234
5235  for (auto& pair : R.MaxLocalUsers) {
5236    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5237    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5238                      << " registers of "
5239                      << TTI.getRegisterClassName(pair.first) << " register class\n");
5240    if (VF == 1) {
5241      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5242        TargetNumRegisters = ForceTargetNumScalarRegs;
5243    } else {
5244      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5245        TargetNumRegisters = ForceTargetNumVectorRegs;
5246    }
5247    unsigned MaxLocalUsers = pair.second;
5248    unsigned LoopInvariantRegs = 0;
5249    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5250      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5251
5252    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5253    // Don't count the induction variable as interleaved.
5254    if (EnableIndVarRegisterHeur) {
5255      TmpIC =
5256          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5257                        std::max(1U, (MaxLocalUsers - 1)));
5258    }
5259
5260    IC = std::min(IC, TmpIC);
5261  }
5262
5263  // Clamp the interleave ranges to reasonable counts.
5264  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5265
5266  // Check if the user has overridden the max.
5267  if (VF == 1) {
5268    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5269      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5270  } else {
5271    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5272      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5273  }
5274
5275  // If trip count is known or estimated compile time constant, limit the
5276  // interleave count to be less than the trip count divided by VF.
5277  if (BestKnownTC) {
5278    MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5279  }
5280
5281  // If we did not calculate the cost for VF (because the user selected the VF)
5282  // then we calculate the cost of VF here.
5283  if (LoopCost == 0)
5284    LoopCost = expectedCost(VF).first;
5285
5286  assert(LoopCost && "Non-zero loop cost expected");
5287
5288  // Clamp the calculated IC to be between the 1 and the max interleave count
5289  // that the target and trip count allows.
5290  if (IC > MaxInterleaveCount)
5291    IC = MaxInterleaveCount;
5292  else if (IC < 1)
5293    IC = 1;
5294
5295  // Interleave if we vectorized this loop and there is a reduction that could
5296  // benefit from interleaving.
5297  if (VF > 1 && !Legal->getReductionVars()->empty()) {
5298    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5299    return IC;
5300  }
5301
5302  // Note that if we've already vectorized the loop we will have done the
5303  // runtime check and so interleaving won't require further checks.
5304  bool InterleavingRequiresRuntimePointerCheck =
5305      (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5306
5307  // We want to interleave small loops in order to reduce the loop overhead and
5308  // potentially expose ILP opportunities.
5309  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5310  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5311    // We assume that the cost overhead is 1 and we use the cost model
5312    // to estimate the cost of the loop and interleave until the cost of the
5313    // loop overhead is about 5% of the cost of the loop.
5314    unsigned SmallIC =
5315        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5316
5317    // Interleave until store/load ports (estimated by max interleave count) are
5318    // saturated.
5319    unsigned NumStores = Legal->getNumStores();
5320    unsigned NumLoads = Legal->getNumLoads();
5321    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5322    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5323
5324    // If we have a scalar reduction (vector reductions are already dealt with
5325    // by this point), we can increase the critical path length if the loop
5326    // we're interleaving is inside another loop. Limit, by default to 2, so the
5327    // critical path only gets increased by one reduction operation.
5328    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5329      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5330      SmallIC = std::min(SmallIC, F);
5331      StoresIC = std::min(StoresIC, F);
5332      LoadsIC = std::min(LoadsIC, F);
5333    }
5334
5335    if (EnableLoadStoreRuntimeInterleave &&
5336        std::max(StoresIC, LoadsIC) > SmallIC) {
5337      LLVM_DEBUG(
5338          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5339      return std::max(StoresIC, LoadsIC);
5340    }
5341
5342    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5343    return SmallIC;
5344  }
5345
5346  // Interleave if this is a large loop (small loops are already dealt with by
5347  // this point) that could benefit from interleaving.
5348  bool HasReductions = !Legal->getReductionVars()->empty();
5349  if (TTI.enableAggressiveInterleaving(HasReductions)) {
5350    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5351    return IC;
5352  }
5353
5354  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5355  return 1;
5356}
5357
5358SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5359LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5360  // This function calculates the register usage by measuring the highest number
5361  // of values that are alive at a single location. Obviously, this is a very
5362  // rough estimation. We scan the loop in a topological order in order and
5363  // assign a number to each instruction. We use RPO to ensure that defs are
5364  // met before their users. We assume that each instruction that has in-loop
5365  // users starts an interval. We record every time that an in-loop value is
5366  // used, so we have a list of the first and last occurrences of each
5367  // instruction. Next, we transpose this data structure into a multi map that
5368  // holds the list of intervals that *end* at a specific location. This multi
5369  // map allows us to perform a linear search. We scan the instructions linearly
5370  // and record each time that a new interval starts, by placing it in a set.
5371  // If we find this value in the multi-map then we remove it from the set.
5372  // The max register usage is the maximum size of the set.
5373  // We also search for instructions that are defined outside the loop, but are
5374  // used inside the loop. We need this number separately from the max-interval
5375  // usage number because when we unroll, loop-invariant values do not take
5376  // more register.
5377  LoopBlocksDFS DFS(TheLoop);
5378  DFS.perform(LI);
5379
5380  RegisterUsage RU;
5381
5382  // Each 'key' in the map opens a new interval. The values
5383  // of the map are the index of the 'last seen' usage of the
5384  // instruction that is the key.
5385  using IntervalMap = DenseMap<Instruction *, unsigned>;
5386
5387  // Maps instruction to its index.
5388  SmallVector<Instruction *, 64> IdxToInstr;
5389  // Marks the end of each interval.
5390  IntervalMap EndPoint;
5391  // Saves the list of instruction indices that are used in the loop.
5392  SmallPtrSet<Instruction *, 8> Ends;
5393  // Saves the list of values that are used in the loop but are
5394  // defined outside the loop, such as arguments and constants.
5395  SmallPtrSet<Value *, 8> LoopInvariants;
5396
5397  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5398    for (Instruction &I : BB->instructionsWithoutDebug()) {
5399      IdxToInstr.push_back(&I);
5400
5401      // Save the end location of each USE.
5402      for (Value *U : I.operands()) {
5403        auto *Instr = dyn_cast<Instruction>(U);
5404
5405        // Ignore non-instruction values such as arguments, constants, etc.
5406        if (!Instr)
5407          continue;
5408
5409        // If this instruction is outside the loop then record it and continue.
5410        if (!TheLoop->contains(Instr)) {
5411          LoopInvariants.insert(Instr);
5412          continue;
5413        }
5414
5415        // Overwrite previous end points.
5416        EndPoint[Instr] = IdxToInstr.size();
5417        Ends.insert(Instr);
5418      }
5419    }
5420  }
5421
5422  // Saves the list of intervals that end with the index in 'key'.
5423  using InstrList = SmallVector<Instruction *, 2>;
5424  DenseMap<unsigned, InstrList> TransposeEnds;
5425
5426  // Transpose the EndPoints to a list of values that end at each index.
5427  for (auto &Interval : EndPoint)
5428    TransposeEnds[Interval.second].push_back(Interval.first);
5429
5430  SmallPtrSet<Instruction *, 8> OpenIntervals;
5431
5432  // Get the size of the widest register.
5433  unsigned MaxSafeDepDist = -1U;
5434  if (Legal->getMaxSafeDepDistBytes() != -1U)
5435    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5436  unsigned WidestRegister =
5437      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5438  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5439
5440  SmallVector<RegisterUsage, 8> RUs(VFs.size());
5441  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5442
5443  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5444
5445  // A lambda that gets the register usage for the given type and VF.
5446  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5447    if (Ty->isTokenTy())
5448      return 0U;
5449    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5450    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5451  };
5452
5453  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5454    Instruction *I = IdxToInstr[i];
5455
5456    // Remove all of the instructions that end at this location.
5457    InstrList &List = TransposeEnds[i];
5458    for (Instruction *ToRemove : List)
5459      OpenIntervals.erase(ToRemove);
5460
5461    // Ignore instructions that are never used within the loop.
5462    if (Ends.find(I) == Ends.end())
5463      continue;
5464
5465    // Skip ignored values.
5466    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5467      continue;
5468
5469    // For each VF find the maximum usage of registers.
5470    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5471      // Count the number of live intervals.
5472      SmallMapVector<unsigned, unsigned, 4> RegUsage;
5473
5474      if (VFs[j] == 1) {
5475        for (auto Inst : OpenIntervals) {
5476          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5477          if (RegUsage.find(ClassID) == RegUsage.end())
5478            RegUsage[ClassID] = 1;
5479          else
5480            RegUsage[ClassID] += 1;
5481        }
5482      } else {
5483        collectUniformsAndScalars(VFs[j]);
5484        for (auto Inst : OpenIntervals) {
5485          // Skip ignored values for VF > 1.
5486          if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5487            continue;
5488          if (isScalarAfterVectorization(Inst, VFs[j])) {
5489            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5490            if (RegUsage.find(ClassID) == RegUsage.end())
5491              RegUsage[ClassID] = 1;
5492            else
5493              RegUsage[ClassID] += 1;
5494          } else {
5495            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5496            if (RegUsage.find(ClassID) == RegUsage.end())
5497              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5498            else
5499              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5500          }
5501        }
5502      }
5503
5504      for (auto& pair : RegUsage) {
5505        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5506          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5507        else
5508          MaxUsages[j][pair.first] = pair.second;
5509      }
5510    }
5511
5512    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5513                      << OpenIntervals.size() << '\n');
5514
5515    // Add the current instruction to the list of open intervals.
5516    OpenIntervals.insert(I);
5517  }
5518
5519  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5520    SmallMapVector<unsigned, unsigned, 4> Invariant;
5521
5522    for (auto Inst : LoopInvariants) {
5523      unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5524      unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5525      if (Invariant.find(ClassID) == Invariant.end())
5526        Invariant[ClassID] = Usage;
5527      else
5528        Invariant[ClassID] += Usage;
5529    }
5530
5531    LLVM_DEBUG({
5532      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5533      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5534             << " item\n";
5535      for (const auto &pair : MaxUsages[i]) {
5536        dbgs() << "LV(REG): RegisterClass: "
5537               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5538               << " registers\n";
5539      }
5540      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5541             << " item\n";
5542      for (const auto &pair : Invariant) {
5543        dbgs() << "LV(REG): RegisterClass: "
5544               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5545               << " registers\n";
5546      }
5547    });
5548
5549    RU.LoopInvariantRegs = Invariant;
5550    RU.MaxLocalUsers = MaxUsages[i];
5551    RUs[i] = RU;
5552  }
5553
5554  return RUs;
5555}
5556
5557bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5558  // TODO: Cost model for emulated masked load/store is completely
5559  // broken. This hack guides the cost model to use an artificially
5560  // high enough value to practically disable vectorization with such
5561  // operations, except where previously deployed legality hack allowed
5562  // using very low cost values. This is to avoid regressions coming simply
5563  // from moving "masked load/store" check from legality to cost model.
5564  // Masked Load/Gather emulation was previously never allowed.
5565  // Limited number of Masked Store/Scatter emulation was allowed.
5566  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5567  return isa<LoadInst>(I) ||
5568         (isa<StoreInst>(I) &&
5569          NumPredStores > NumberOfStoresToPredicate);
5570}
5571
5572void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5573  // If we aren't vectorizing the loop, or if we've already collected the
5574  // instructions to scalarize, there's nothing to do. Collection may already
5575  // have occurred if we have a user-selected VF and are now computing the
5576  // expected cost for interleaving.
5577  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5578    return;
5579
5580  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5581  // not profitable to scalarize any instructions, the presence of VF in the
5582  // map will indicate that we've analyzed it already.
5583  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5584
5585  // Find all the instructions that are scalar with predication in the loop and
5586  // determine if it would be better to not if-convert the blocks they are in.
5587  // If so, we also record the instructions to scalarize.
5588  for (BasicBlock *BB : TheLoop->blocks()) {
5589    if (!blockNeedsPredication(BB))
5590      continue;
5591    for (Instruction &I : *BB)
5592      if (isScalarWithPredication(&I)) {
5593        ScalarCostsTy ScalarCosts;
5594        // Do not apply discount logic if hacked cost is needed
5595        // for emulated masked memrefs.
5596        if (!useEmulatedMaskMemRefHack(&I) &&
5597            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5598          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5599        // Remember that BB will remain after vectorization.
5600        PredicatedBBsAfterVectorization.insert(BB);
5601      }
5602  }
5603}
5604
5605int LoopVectorizationCostModel::computePredInstDiscount(
5606    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5607    unsigned VF) {
5608  assert(!isUniformAfterVectorization(PredInst, VF) &&
5609         "Instruction marked uniform-after-vectorization will be predicated");
5610
5611  // Initialize the discount to zero, meaning that the scalar version and the
5612  // vector version cost the same.
5613  int Discount = 0;
5614
5615  // Holds instructions to analyze. The instructions we visit are mapped in
5616  // ScalarCosts. Those instructions are the ones that would be scalarized if
5617  // we find that the scalar version costs less.
5618  SmallVector<Instruction *, 8> Worklist;
5619
5620  // Returns true if the given instruction can be scalarized.
5621  auto canBeScalarized = [&](Instruction *I) -> bool {
5622    // We only attempt to scalarize instructions forming a single-use chain
5623    // from the original predicated block that would otherwise be vectorized.
5624    // Although not strictly necessary, we give up on instructions we know will
5625    // already be scalar to avoid traversing chains that are unlikely to be
5626    // beneficial.
5627    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5628        isScalarAfterVectorization(I, VF))
5629      return false;
5630
5631    // If the instruction is scalar with predication, it will be analyzed
5632    // separately. We ignore it within the context of PredInst.
5633    if (isScalarWithPredication(I))
5634      return false;
5635
5636    // If any of the instruction's operands are uniform after vectorization,
5637    // the instruction cannot be scalarized. This prevents, for example, a
5638    // masked load from being scalarized.
5639    //
5640    // We assume we will only emit a value for lane zero of an instruction
5641    // marked uniform after vectorization, rather than VF identical values.
5642    // Thus, if we scalarize an instruction that uses a uniform, we would
5643    // create uses of values corresponding to the lanes we aren't emitting code
5644    // for. This behavior can be changed by allowing getScalarValue to clone
5645    // the lane zero values for uniforms rather than asserting.
5646    for (Use &U : I->operands())
5647      if (auto *J = dyn_cast<Instruction>(U.get()))
5648        if (isUniformAfterVectorization(J, VF))
5649          return false;
5650
5651    // Otherwise, we can scalarize the instruction.
5652    return true;
5653  };
5654
5655  // Compute the expected cost discount from scalarizing the entire expression
5656  // feeding the predicated instruction. We currently only consider expressions
5657  // that are single-use instruction chains.
5658  Worklist.push_back(PredInst);
5659  while (!Worklist.empty()) {
5660    Instruction *I = Worklist.pop_back_val();
5661
5662    // If we've already analyzed the instruction, there's nothing to do.
5663    if (ScalarCosts.find(I) != ScalarCosts.end())
5664      continue;
5665
5666    // Compute the cost of the vector instruction. Note that this cost already
5667    // includes the scalarization overhead of the predicated instruction.
5668    unsigned VectorCost = getInstructionCost(I, VF).first;
5669
5670    // Compute the cost of the scalarized instruction. This cost is the cost of
5671    // the instruction as if it wasn't if-converted and instead remained in the
5672    // predicated block. We will scale this cost by block probability after
5673    // computing the scalarization overhead.
5674    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5675
5676    // Compute the scalarization overhead of needed insertelement instructions
5677    // and phi nodes.
5678    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5679      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5680                                                 true, false);
5681      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5682    }
5683
5684    // Compute the scalarization overhead of needed extractelement
5685    // instructions. For each of the instruction's operands, if the operand can
5686    // be scalarized, add it to the worklist; otherwise, account for the
5687    // overhead.
5688    for (Use &U : I->operands())
5689      if (auto *J = dyn_cast<Instruction>(U.get())) {
5690        assert(VectorType::isValidElementType(J->getType()) &&
5691               "Instruction has non-scalar type");
5692        if (canBeScalarized(J))
5693          Worklist.push_back(J);
5694        else if (needsExtract(J, VF))
5695          ScalarCost += TTI.getScalarizationOverhead(
5696                              ToVectorTy(J->getType(),VF), false, true);
5697      }
5698
5699    // Scale the total scalar cost by block probability.
5700    ScalarCost /= getReciprocalPredBlockProb();
5701
5702    // Compute the discount. A non-negative discount means the vector version
5703    // of the instruction costs more, and scalarizing would be beneficial.
5704    Discount += VectorCost - ScalarCost;
5705    ScalarCosts[I] = ScalarCost;
5706  }
5707
5708  return Discount;
5709}
5710
5711LoopVectorizationCostModel::VectorizationCostTy
5712LoopVectorizationCostModel::expectedCost(unsigned VF) {
5713  VectorizationCostTy Cost;
5714
5715  // For each block.
5716  for (BasicBlock *BB : TheLoop->blocks()) {
5717    VectorizationCostTy BlockCost;
5718
5719    // For each instruction in the old loop.
5720    for (Instruction &I : BB->instructionsWithoutDebug()) {
5721      // Skip ignored values.
5722      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5723          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5724        continue;
5725
5726      VectorizationCostTy C = getInstructionCost(&I, VF);
5727
5728      // Check if we should override the cost.
5729      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5730        C.first = ForceTargetInstructionCost;
5731
5732      BlockCost.first += C.first;
5733      BlockCost.second |= C.second;
5734      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5735                        << " for VF " << VF << " For instruction: " << I
5736                        << '\n');
5737    }
5738
5739    // If we are vectorizing a predicated block, it will have been
5740    // if-converted. This means that the block's instructions (aside from
5741    // stores and instructions that may divide by zero) will now be
5742    // unconditionally executed. For the scalar case, we may not always execute
5743    // the predicated block. Thus, scale the block's cost by the probability of
5744    // executing it.
5745    if (VF == 1 && blockNeedsPredication(BB))
5746      BlockCost.first /= getReciprocalPredBlockProb();
5747
5748    Cost.first += BlockCost.first;
5749    Cost.second |= BlockCost.second;
5750  }
5751
5752  return Cost;
5753}
5754
5755/// Gets Address Access SCEV after verifying that the access pattern
5756/// is loop invariant except the induction variable dependence.
5757///
5758/// This SCEV can be sent to the Target in order to estimate the address
5759/// calculation cost.
5760static const SCEV *getAddressAccessSCEV(
5761              Value *Ptr,
5762              LoopVectorizationLegality *Legal,
5763              PredicatedScalarEvolution &PSE,
5764              const Loop *TheLoop) {
5765
5766  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5767  if (!Gep)
5768    return nullptr;
5769
5770  // We are looking for a gep with all loop invariant indices except for one
5771  // which should be an induction variable.
5772  auto SE = PSE.getSE();
5773  unsigned NumOperands = Gep->getNumOperands();
5774  for (unsigned i = 1; i < NumOperands; ++i) {
5775    Value *Opd = Gep->getOperand(i);
5776    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5777        !Legal->isInductionVariable(Opd))
5778      return nullptr;
5779  }
5780
5781  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5782  return PSE.getSCEV(Ptr);
5783}
5784
5785static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5786  return Legal->hasStride(I->getOperand(0)) ||
5787         Legal->hasStride(I->getOperand(1));
5788}
5789
5790unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5791                                                                 unsigned VF) {
5792  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5793  Type *ValTy = getMemInstValueType(I);
5794  auto SE = PSE.getSE();
5795
5796  unsigned AS = getLoadStoreAddressSpace(I);
5797  Value *Ptr = getLoadStorePointerOperand(I);
5798  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5799
5800  // Figure out whether the access is strided and get the stride value
5801  // if it's known in compile time
5802  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5803
5804  // Get the cost of the scalar memory instruction and address computation.
5805  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5806
5807  // Don't pass *I here, since it is scalar but will actually be part of a
5808  // vectorized loop where the user of it is a vectorized instruction.
5809  const MaybeAlign Alignment = getLoadStoreAlignment(I);
5810  Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5811                                   Alignment, AS);
5812
5813  // Get the overhead of the extractelement and insertelement instructions
5814  // we might create due to scalarization.
5815  Cost += getScalarizationOverhead(I, VF);
5816
5817  // If we have a predicated store, it may not be executed for each vector
5818  // lane. Scale the cost by the probability of executing the predicated
5819  // block.
5820  if (isPredicatedInst(I)) {
5821    Cost /= getReciprocalPredBlockProb();
5822
5823    if (useEmulatedMaskMemRefHack(I))
5824      // Artificially setting to a high enough value to practically disable
5825      // vectorization with such operations.
5826      Cost = 3000000;
5827  }
5828
5829  return Cost;
5830}
5831
5832unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5833                                                             unsigned VF) {
5834  Type *ValTy = getMemInstValueType(I);
5835  Type *VectorTy = ToVectorTy(ValTy, VF);
5836  Value *Ptr = getLoadStorePointerOperand(I);
5837  unsigned AS = getLoadStoreAddressSpace(I);
5838  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5839
5840  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5841         "Stride should be 1 or -1 for consecutive memory access");
5842  const MaybeAlign Alignment = getLoadStoreAlignment(I);
5843  unsigned Cost = 0;
5844  if (Legal->isMaskRequired(I))
5845    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5846                                      Alignment ? Alignment->value() : 0, AS);
5847  else
5848    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5849
5850  bool Reverse = ConsecutiveStride < 0;
5851  if (Reverse)
5852    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5853  return Cost;
5854}
5855
5856unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5857                                                         unsigned VF) {
5858  Type *ValTy = getMemInstValueType(I);
5859  Type *VectorTy = ToVectorTy(ValTy, VF);
5860  const MaybeAlign Alignment = getLoadStoreAlignment(I);
5861  unsigned AS = getLoadStoreAddressSpace(I);
5862  if (isa<LoadInst>(I)) {
5863    return TTI.getAddressComputationCost(ValTy) +
5864           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5865           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5866  }
5867  StoreInst *SI = cast<StoreInst>(I);
5868
5869  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5870  return TTI.getAddressComputationCost(ValTy) +
5871         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5872         (isLoopInvariantStoreValue
5873              ? 0
5874              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5875                                       VF - 1));
5876}
5877
5878unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5879                                                          unsigned VF) {
5880  Type *ValTy = getMemInstValueType(I);
5881  Type *VectorTy = ToVectorTy(ValTy, VF);
5882  const MaybeAlign Alignment = getLoadStoreAlignment(I);
5883  Value *Ptr = getLoadStorePointerOperand(I);
5884
5885  return TTI.getAddressComputationCost(VectorTy) +
5886         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5887                                    Legal->isMaskRequired(I),
5888                                    Alignment ? Alignment->value() : 0);
5889}
5890
5891unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5892                                                            unsigned VF) {
5893  Type *ValTy = getMemInstValueType(I);
5894  Type *VectorTy = ToVectorTy(ValTy, VF);
5895  unsigned AS = getLoadStoreAddressSpace(I);
5896
5897  auto Group = getInterleavedAccessGroup(I);
5898  assert(Group && "Fail to get an interleaved access group.");
5899
5900  unsigned InterleaveFactor = Group->getFactor();
5901  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5902
5903  // Holds the indices of existing members in an interleaved load group.
5904  // An interleaved store group doesn't need this as it doesn't allow gaps.
5905  SmallVector<unsigned, 4> Indices;
5906  if (isa<LoadInst>(I)) {
5907    for (unsigned i = 0; i < InterleaveFactor; i++)
5908      if (Group->getMember(i))
5909        Indices.push_back(i);
5910  }
5911
5912  // Calculate the cost of the whole interleaved group.
5913  bool UseMaskForGaps =
5914      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5915  unsigned Cost = TTI.getInterleavedMemoryOpCost(
5916      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5917      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5918
5919  if (Group->isReverse()) {
5920    // TODO: Add support for reversed masked interleaved access.
5921    assert(!Legal->isMaskRequired(I) &&
5922           "Reverse masked interleaved access not supported.");
5923    Cost += Group->getNumMembers() *
5924            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5925  }
5926  return Cost;
5927}
5928
5929unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5930                                                              unsigned VF) {
5931  // Calculate scalar cost only. Vectorization cost should be ready at this
5932  // moment.
5933  if (VF == 1) {
5934    Type *ValTy = getMemInstValueType(I);
5935    const MaybeAlign Alignment = getLoadStoreAlignment(I);
5936    unsigned AS = getLoadStoreAddressSpace(I);
5937
5938    return TTI.getAddressComputationCost(ValTy) +
5939           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5940  }
5941  return getWideningCost(I, VF);
5942}
5943
5944LoopVectorizationCostModel::VectorizationCostTy
5945LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5946  // If we know that this instruction will remain uniform, check the cost of
5947  // the scalar version.
5948  if (isUniformAfterVectorization(I, VF))
5949    VF = 1;
5950
5951  if (VF > 1 && isProfitableToScalarize(I, VF))
5952    return VectorizationCostTy(InstsToScalarize[VF][I], false);
5953
5954  // Forced scalars do not have any scalarization overhead.
5955  auto ForcedScalar = ForcedScalars.find(VF);
5956  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5957    auto InstSet = ForcedScalar->second;
5958    if (InstSet.find(I) != InstSet.end())
5959      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5960  }
5961
5962  Type *VectorTy;
5963  unsigned C = getInstructionCost(I, VF, VectorTy);
5964
5965  bool TypeNotScalarized =
5966      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5967  return VectorizationCostTy(C, TypeNotScalarized);
5968}
5969
5970unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5971                                                              unsigned VF) {
5972
5973  if (VF == 1)
5974    return 0;
5975
5976  unsigned Cost = 0;
5977  Type *RetTy = ToVectorTy(I->getType(), VF);
5978  if (!RetTy->isVoidTy() &&
5979      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5980    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5981
5982  // Some targets keep addresses scalar.
5983  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5984    return Cost;
5985
5986  // Some targets support efficient element stores.
5987  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5988    return Cost;
5989
5990  // Collect operands to consider.
5991  CallInst *CI = dyn_cast<CallInst>(I);
5992  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5993
5994  // Skip operands that do not require extraction/scalarization and do not incur
5995  // any overhead.
5996  return Cost + TTI.getOperandsScalarizationOverhead(
5997                    filterExtractingOperands(Ops, VF), VF);
5998}
5999
6000void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6001  if (VF == 1)
6002    return;
6003  NumPredStores = 0;
6004  for (BasicBlock *BB : TheLoop->blocks()) {
6005    // For each instruction in the old loop.
6006    for (Instruction &I : *BB) {
6007      Value *Ptr =  getLoadStorePointerOperand(&I);
6008      if (!Ptr)
6009        continue;
6010
6011      // TODO: We should generate better code and update the cost model for
6012      // predicated uniform stores. Today they are treated as any other
6013      // predicated store (see added test cases in
6014      // invariant-store-vectorization.ll).
6015      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6016        NumPredStores++;
6017
6018      if (Legal->isUniform(Ptr) &&
6019          // Conditional loads and stores should be scalarized and predicated.
6020          // isScalarWithPredication cannot be used here since masked
6021          // gather/scatters are not considered scalar with predication.
6022          !Legal->blockNeedsPredication(I.getParent())) {
6023        // TODO: Avoid replicating loads and stores instead of
6024        // relying on instcombine to remove them.
6025        // Load: Scalar load + broadcast
6026        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6027        unsigned Cost = getUniformMemOpCost(&I, VF);
6028        setWideningDecision(&I, VF, CM_Scalarize, Cost);
6029        continue;
6030      }
6031
6032      // We assume that widening is the best solution when possible.
6033      if (memoryInstructionCanBeWidened(&I, VF)) {
6034        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6035        int ConsecutiveStride =
6036               Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6037        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6038               "Expected consecutive stride.");
6039        InstWidening Decision =
6040            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6041        setWideningDecision(&I, VF, Decision, Cost);
6042        continue;
6043      }
6044
6045      // Choose between Interleaving, Gather/Scatter or Scalarization.
6046      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6047      unsigned NumAccesses = 1;
6048      if (isAccessInterleaved(&I)) {
6049        auto Group = getInterleavedAccessGroup(&I);
6050        assert(Group && "Fail to get an interleaved access group.");
6051
6052        // Make one decision for the whole group.
6053        if (getWideningDecision(&I, VF) != CM_Unknown)
6054          continue;
6055
6056        NumAccesses = Group->getNumMembers();
6057        if (interleavedAccessCanBeWidened(&I, VF))
6058          InterleaveCost = getInterleaveGroupCost(&I, VF);
6059      }
6060
6061      unsigned GatherScatterCost =
6062          isLegalGatherOrScatter(&I)
6063              ? getGatherScatterCost(&I, VF) * NumAccesses
6064              : std::numeric_limits<unsigned>::max();
6065
6066      unsigned ScalarizationCost =
6067          getMemInstScalarizationCost(&I, VF) * NumAccesses;
6068
6069      // Choose better solution for the current VF,
6070      // write down this decision and use it during vectorization.
6071      unsigned Cost;
6072      InstWidening Decision;
6073      if (InterleaveCost <= GatherScatterCost &&
6074          InterleaveCost < ScalarizationCost) {
6075        Decision = CM_Interleave;
6076        Cost = InterleaveCost;
6077      } else if (GatherScatterCost < ScalarizationCost) {
6078        Decision = CM_GatherScatter;
6079        Cost = GatherScatterCost;
6080      } else {
6081        Decision = CM_Scalarize;
6082        Cost = ScalarizationCost;
6083      }
6084      // If the instructions belongs to an interleave group, the whole group
6085      // receives the same decision. The whole group receives the cost, but
6086      // the cost will actually be assigned to one instruction.
6087      if (auto Group = getInterleavedAccessGroup(&I))
6088        setWideningDecision(Group, VF, Decision, Cost);
6089      else
6090        setWideningDecision(&I, VF, Decision, Cost);
6091    }
6092  }
6093
6094  // Make sure that any load of address and any other address computation
6095  // remains scalar unless there is gather/scatter support. This avoids
6096  // inevitable extracts into address registers, and also has the benefit of
6097  // activating LSR more, since that pass can't optimize vectorized
6098  // addresses.
6099  if (TTI.prefersVectorizedAddressing())
6100    return;
6101
6102  // Start with all scalar pointer uses.
6103  SmallPtrSet<Instruction *, 8> AddrDefs;
6104  for (BasicBlock *BB : TheLoop->blocks())
6105    for (Instruction &I : *BB) {
6106      Instruction *PtrDef =
6107        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6108      if (PtrDef && TheLoop->contains(PtrDef) &&
6109          getWideningDecision(&I, VF) != CM_GatherScatter)
6110        AddrDefs.insert(PtrDef);
6111    }
6112
6113  // Add all instructions used to generate the addresses.
6114  SmallVector<Instruction *, 4> Worklist;
6115  for (auto *I : AddrDefs)
6116    Worklist.push_back(I);
6117  while (!Worklist.empty()) {
6118    Instruction *I = Worklist.pop_back_val();
6119    for (auto &Op : I->operands())
6120      if (auto *InstOp = dyn_cast<Instruction>(Op))
6121        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6122            AddrDefs.insert(InstOp).second)
6123          Worklist.push_back(InstOp);
6124  }
6125
6126  for (auto *I : AddrDefs) {
6127    if (isa<LoadInst>(I)) {
6128      // Setting the desired widening decision should ideally be handled in
6129      // by cost functions, but since this involves the task of finding out
6130      // if the loaded register is involved in an address computation, it is
6131      // instead changed here when we know this is the case.
6132      InstWidening Decision = getWideningDecision(I, VF);
6133      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6134        // Scalarize a widened load of address.
6135        setWideningDecision(I, VF, CM_Scalarize,
6136                            (VF * getMemoryInstructionCost(I, 1)));
6137      else if (auto Group = getInterleavedAccessGroup(I)) {
6138        // Scalarize an interleave group of address loads.
6139        for (unsigned I = 0; I < Group->getFactor(); ++I) {
6140          if (Instruction *Member = Group->getMember(I))
6141            setWideningDecision(Member, VF, CM_Scalarize,
6142                                (VF * getMemoryInstructionCost(Member, 1)));
6143        }
6144      }
6145    } else
6146      // Make sure I gets scalarized and a cost estimate without
6147      // scalarization overhead.
6148      ForcedScalars[VF].insert(I);
6149  }
6150}
6151
6152unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6153                                                        unsigned VF,
6154                                                        Type *&VectorTy) {
6155  Type *RetTy = I->getType();
6156  if (canTruncateToMinimalBitwidth(I, VF))
6157    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6158  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6159  auto SE = PSE.getSE();
6160
6161  // TODO: We need to estimate the cost of intrinsic calls.
6162  switch (I->getOpcode()) {
6163  case Instruction::GetElementPtr:
6164    // We mark this instruction as zero-cost because the cost of GEPs in
6165    // vectorized code depends on whether the corresponding memory instruction
6166    // is scalarized or not. Therefore, we handle GEPs with the memory
6167    // instruction cost.
6168    return 0;
6169  case Instruction::Br: {
6170    // In cases of scalarized and predicated instructions, there will be VF
6171    // predicated blocks in the vectorized loop. Each branch around these
6172    // blocks requires also an extract of its vector compare i1 element.
6173    bool ScalarPredicatedBB = false;
6174    BranchInst *BI = cast<BranchInst>(I);
6175    if (VF > 1 && BI->isConditional() &&
6176        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6177             PredicatedBBsAfterVectorization.end() ||
6178         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6179             PredicatedBBsAfterVectorization.end()))
6180      ScalarPredicatedBB = true;
6181
6182    if (ScalarPredicatedBB) {
6183      // Return cost for branches around scalarized and predicated blocks.
6184      Type *Vec_i1Ty =
6185          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6186      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6187              (TTI.getCFInstrCost(Instruction::Br) * VF));
6188    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6189      // The back-edge branch will remain, as will all scalar branches.
6190      return TTI.getCFInstrCost(Instruction::Br);
6191    else
6192      // This branch will be eliminated by if-conversion.
6193      return 0;
6194    // Note: We currently assume zero cost for an unconditional branch inside
6195    // a predicated block since it will become a fall-through, although we
6196    // may decide in the future to call TTI for all branches.
6197  }
6198  case Instruction::PHI: {
6199    auto *Phi = cast<PHINode>(I);
6200
6201    // First-order recurrences are replaced by vector shuffles inside the loop.
6202    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6203    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6204      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6205                                VectorTy, VF - 1, VectorType::get(RetTy, 1));
6206
6207    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6208    // converted into select instructions. We require N - 1 selects per phi
6209    // node, where N is the number of incoming values.
6210    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6211      return (Phi->getNumIncomingValues() - 1) *
6212             TTI.getCmpSelInstrCost(
6213                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6214                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6215
6216    return TTI.getCFInstrCost(Instruction::PHI);
6217  }
6218  case Instruction::UDiv:
6219  case Instruction::SDiv:
6220  case Instruction::URem:
6221  case Instruction::SRem:
6222    // If we have a predicated instruction, it may not be executed for each
6223    // vector lane. Get the scalarization cost and scale this amount by the
6224    // probability of executing the predicated block. If the instruction is not
6225    // predicated, we fall through to the next case.
6226    if (VF > 1 && isScalarWithPredication(I)) {
6227      unsigned Cost = 0;
6228
6229      // These instructions have a non-void type, so account for the phi nodes
6230      // that we will create. This cost is likely to be zero. The phi node
6231      // cost, if any, should be scaled by the block probability because it
6232      // models a copy at the end of each predicated block.
6233      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6234
6235      // The cost of the non-predicated instruction.
6236      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6237
6238      // The cost of insertelement and extractelement instructions needed for
6239      // scalarization.
6240      Cost += getScalarizationOverhead(I, VF);
6241
6242      // Scale the cost by the probability of executing the predicated blocks.
6243      // This assumes the predicated block for each vector lane is equally
6244      // likely.
6245      return Cost / getReciprocalPredBlockProb();
6246    }
6247    LLVM_FALLTHROUGH;
6248  case Instruction::Add:
6249  case Instruction::FAdd:
6250  case Instruction::Sub:
6251  case Instruction::FSub:
6252  case Instruction::Mul:
6253  case Instruction::FMul:
6254  case Instruction::FDiv:
6255  case Instruction::FRem:
6256  case Instruction::Shl:
6257  case Instruction::LShr:
6258  case Instruction::AShr:
6259  case Instruction::And:
6260  case Instruction::Or:
6261  case Instruction::Xor: {
6262    // Since we will replace the stride by 1 the multiplication should go away.
6263    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6264      return 0;
6265    // Certain instructions can be cheaper to vectorize if they have a constant
6266    // second vector operand. One example of this are shifts on x86.
6267    Value *Op2 = I->getOperand(1);
6268    TargetTransformInfo::OperandValueProperties Op2VP;
6269    TargetTransformInfo::OperandValueKind Op2VK =
6270        TTI.getOperandInfo(Op2, Op2VP);
6271    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6272      Op2VK = TargetTransformInfo::OK_UniformValue;
6273
6274    SmallVector<const Value *, 4> Operands(I->operand_values());
6275    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6276    return N * TTI.getArithmeticInstrCost(
6277                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6278                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6279  }
6280  case Instruction::FNeg: {
6281    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6282    return N * TTI.getArithmeticInstrCost(
6283                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6284                   TargetTransformInfo::OK_AnyValue,
6285                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6286                   I->getOperand(0), I);
6287  }
6288  case Instruction::Select: {
6289    SelectInst *SI = cast<SelectInst>(I);
6290    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6291    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6292    Type *CondTy = SI->getCondition()->getType();
6293    if (!ScalarCond)
6294      CondTy = VectorType::get(CondTy, VF);
6295
6296    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6297  }
6298  case Instruction::ICmp:
6299  case Instruction::FCmp: {
6300    Type *ValTy = I->getOperand(0)->getType();
6301    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6302    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6303      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6304    VectorTy = ToVectorTy(ValTy, VF);
6305    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6306  }
6307  case Instruction::Store:
6308  case Instruction::Load: {
6309    unsigned Width = VF;
6310    if (Width > 1) {
6311      InstWidening Decision = getWideningDecision(I, Width);
6312      assert(Decision != CM_Unknown &&
6313             "CM decision should be taken at this point");
6314      if (Decision == CM_Scalarize)
6315        Width = 1;
6316    }
6317    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6318    return getMemoryInstructionCost(I, VF);
6319  }
6320  case Instruction::ZExt:
6321  case Instruction::SExt:
6322  case Instruction::FPToUI:
6323  case Instruction::FPToSI:
6324  case Instruction::FPExt:
6325  case Instruction::PtrToInt:
6326  case Instruction::IntToPtr:
6327  case Instruction::SIToFP:
6328  case Instruction::UIToFP:
6329  case Instruction::Trunc:
6330  case Instruction::FPTrunc:
6331  case Instruction::BitCast: {
6332    // We optimize the truncation of induction variables having constant
6333    // integer steps. The cost of these truncations is the same as the scalar
6334    // operation.
6335    if (isOptimizableIVTruncate(I, VF)) {
6336      auto *Trunc = cast<TruncInst>(I);
6337      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6338                                  Trunc->getSrcTy(), Trunc);
6339    }
6340
6341    Type *SrcScalarTy = I->getOperand(0)->getType();
6342    Type *SrcVecTy =
6343        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6344    if (canTruncateToMinimalBitwidth(I, VF)) {
6345      // This cast is going to be shrunk. This may remove the cast or it might
6346      // turn it into slightly different cast. For example, if MinBW == 16,
6347      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6348      //
6349      // Calculate the modified src and dest types.
6350      Type *MinVecTy = VectorTy;
6351      if (I->getOpcode() == Instruction::Trunc) {
6352        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6353        VectorTy =
6354            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6355      } else if (I->getOpcode() == Instruction::ZExt ||
6356                 I->getOpcode() == Instruction::SExt) {
6357        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6358        VectorTy =
6359            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6360      }
6361    }
6362
6363    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6364    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6365  }
6366  case Instruction::Call: {
6367    bool NeedToScalarize;
6368    CallInst *CI = cast<CallInst>(I);
6369    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6370    if (getVectorIntrinsicIDForCall(CI, TLI))
6371      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6372    return CallCost;
6373  }
6374  default:
6375    // The cost of executing VF copies of the scalar instruction. This opcode
6376    // is unknown. Assume that it is the same as 'mul'.
6377    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6378           getScalarizationOverhead(I, VF);
6379  } // end of switch.
6380}
6381
6382char LoopVectorize::ID = 0;
6383
6384static const char lv_name[] = "Loop Vectorization";
6385
6386INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6387INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6388INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6389INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6390INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6391INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6392INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6393INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6394INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6395INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6396INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6397INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6398INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6399INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6400INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6401
6402namespace llvm {
6403
6404Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6405
6406Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6407                              bool VectorizeOnlyWhenForced) {
6408  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6409}
6410
6411} // end namespace llvm
6412
6413bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6414  // Check if the pointer operand of a load or store instruction is
6415  // consecutive.
6416  if (auto *Ptr = getLoadStorePointerOperand(Inst))
6417    return Legal->isConsecutivePtr(Ptr);
6418  return false;
6419}
6420
6421void LoopVectorizationCostModel::collectValuesToIgnore() {
6422  // Ignore ephemeral values.
6423  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6424
6425  // Ignore type-promoting instructions we identified during reduction
6426  // detection.
6427  for (auto &Reduction : *Legal->getReductionVars()) {
6428    RecurrenceDescriptor &RedDes = Reduction.second;
6429    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6430    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6431  }
6432  // Ignore type-casting instructions we identified during induction
6433  // detection.
6434  for (auto &Induction : *Legal->getInductionVars()) {
6435    InductionDescriptor &IndDes = Induction.second;
6436    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6437    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6438  }
6439}
6440
6441// TODO: we could return a pair of values that specify the max VF and
6442// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6443// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6444// doesn't have a cost model that can choose which plan to execute if
6445// more than one is generated.
6446static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6447                                 LoopVectorizationCostModel &CM) {
6448  unsigned WidestType;
6449  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6450  return WidestVectorRegBits / WidestType;
6451}
6452
6453VectorizationFactor
6454LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6455  unsigned VF = UserVF;
6456  // Outer loop handling: They may require CFG and instruction level
6457  // transformations before even evaluating whether vectorization is profitable.
6458  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6459  // the vectorization pipeline.
6460  if (!OrigLoop->empty()) {
6461    // If the user doesn't provide a vectorization factor, determine a
6462    // reasonable one.
6463    if (!UserVF) {
6464      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6465      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6466
6467      // Make sure we have a VF > 1 for stress testing.
6468      if (VPlanBuildStressTest && VF < 2) {
6469        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6470                          << "overriding computed VF.\n");
6471        VF = 4;
6472      }
6473    }
6474    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6475    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6476    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6477                      << " to build VPlans.\n");
6478    buildVPlans(VF, VF);
6479
6480    // For VPlan build stress testing, we bail out after VPlan construction.
6481    if (VPlanBuildStressTest)
6482      return VectorizationFactor::Disabled();
6483
6484    return {VF, 0};
6485  }
6486
6487  LLVM_DEBUG(
6488      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6489                "VPlan-native path.\n");
6490  return VectorizationFactor::Disabled();
6491}
6492
6493Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6494  assert(OrigLoop->empty() && "Inner loop expected.");
6495  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6496  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6497    return None;
6498
6499  // Invalidate interleave groups if all blocks of loop will be predicated.
6500  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6501      !useMaskedInterleavedAccesses(*TTI)) {
6502    LLVM_DEBUG(
6503        dbgs()
6504        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6505           "which requires masked-interleaved support.\n");
6506    CM.InterleaveInfo.reset();
6507  }
6508
6509  if (UserVF) {
6510    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6511    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6512    // Collect the instructions (and their associated costs) that will be more
6513    // profitable to scalarize.
6514    CM.selectUserVectorizationFactor(UserVF);
6515    buildVPlansWithVPRecipes(UserVF, UserVF);
6516    LLVM_DEBUG(printPlans(dbgs()));
6517    return {{UserVF, 0}};
6518  }
6519
6520  unsigned MaxVF = MaybeMaxVF.getValue();
6521  assert(MaxVF != 0 && "MaxVF is zero.");
6522
6523  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6524    // Collect Uniform and Scalar instructions after vectorization with VF.
6525    CM.collectUniformsAndScalars(VF);
6526
6527    // Collect the instructions (and their associated costs) that will be more
6528    // profitable to scalarize.
6529    if (VF > 1)
6530      CM.collectInstsToScalarize(VF);
6531  }
6532
6533  buildVPlansWithVPRecipes(1, MaxVF);
6534  LLVM_DEBUG(printPlans(dbgs()));
6535  if (MaxVF == 1)
6536    return VectorizationFactor::Disabled();
6537
6538  // Select the optimal vectorization factor.
6539  return CM.selectVectorizationFactor(MaxVF);
6540}
6541
6542void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6543  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6544                    << '\n');
6545  BestVF = VF;
6546  BestUF = UF;
6547
6548  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6549    return !Plan->hasVF(VF);
6550  });
6551  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6552}
6553
6554void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6555                                           DominatorTree *DT) {
6556  // Perform the actual loop transformation.
6557
6558  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6559  VPCallbackILV CallbackILV(ILV);
6560
6561  VPTransformState State{BestVF, BestUF,      LI,
6562                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
6563                         &ILV,   CallbackILV};
6564  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6565  State.TripCount = ILV.getOrCreateTripCount(nullptr);
6566
6567  //===------------------------------------------------===//
6568  //
6569  // Notice: any optimization or new instruction that go
6570  // into the code below should also be implemented in
6571  // the cost-model.
6572  //
6573  //===------------------------------------------------===//
6574
6575  // 2. Copy and widen instructions from the old loop into the new loop.
6576  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6577  VPlans.front()->execute(&State);
6578
6579  // 3. Fix the vectorized code: take care of header phi's, live-outs,
6580  //    predication, updating analyses.
6581  ILV.fixVectorizedLoop();
6582}
6583
6584void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6585    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6586  BasicBlock *Latch = OrigLoop->getLoopLatch();
6587
6588  // We create new control-flow for the vectorized loop, so the original
6589  // condition will be dead after vectorization if it's only used by the
6590  // branch.
6591  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6592  if (Cmp && Cmp->hasOneUse())
6593    DeadInstructions.insert(Cmp);
6594
6595  // We create new "steps" for induction variable updates to which the original
6596  // induction variables map. An original update instruction will be dead if
6597  // all its users except the induction variable are dead.
6598  for (auto &Induction : *Legal->getInductionVars()) {
6599    PHINode *Ind = Induction.first;
6600    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6601    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6602          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6603                                 DeadInstructions.end();
6604        }))
6605      DeadInstructions.insert(IndUpdate);
6606
6607    // We record as "Dead" also the type-casting instructions we had identified
6608    // during induction analysis. We don't need any handling for them in the
6609    // vectorized loop because we have proven that, under a proper runtime
6610    // test guarding the vectorized loop, the value of the phi, and the casted
6611    // value of the phi, are the same. The last instruction in this casting chain
6612    // will get its scalar/vector/widened def from the scalar/vector/widened def
6613    // of the respective phi node. Any other casts in the induction def-use chain
6614    // have no other uses outside the phi update chain, and will be ignored.
6615    InductionDescriptor &IndDes = Induction.second;
6616    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6617    DeadInstructions.insert(Casts.begin(), Casts.end());
6618  }
6619}
6620
6621Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6622
6623Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6624
6625Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6626                                        Instruction::BinaryOps BinOp) {
6627  // When unrolling and the VF is 1, we only need to add a simple scalar.
6628  Type *Ty = Val->getType();
6629  assert(!Ty->isVectorTy() && "Val must be a scalar");
6630
6631  if (Ty->isFloatingPointTy()) {
6632    Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6633
6634    // Floating point operations had to be 'fast' to enable the unrolling.
6635    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6636    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6637  }
6638  Constant *C = ConstantInt::get(Ty, StartIdx);
6639  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6640}
6641
6642static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6643  SmallVector<Metadata *, 4> MDs;
6644  // Reserve first location for self reference to the LoopID metadata node.
6645  MDs.push_back(nullptr);
6646  bool IsUnrollMetadata = false;
6647  MDNode *LoopID = L->getLoopID();
6648  if (LoopID) {
6649    // First find existing loop unrolling disable metadata.
6650    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6651      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6652      if (MD) {
6653        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6654        IsUnrollMetadata =
6655            S && S->getString().startswith("llvm.loop.unroll.disable");
6656      }
6657      MDs.push_back(LoopID->getOperand(i));
6658    }
6659  }
6660
6661  if (!IsUnrollMetadata) {
6662    // Add runtime unroll disable metadata.
6663    LLVMContext &Context = L->getHeader()->getContext();
6664    SmallVector<Metadata *, 1> DisableOperands;
6665    DisableOperands.push_back(
6666        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6667    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6668    MDs.push_back(DisableNode);
6669    MDNode *NewLoopID = MDNode::get(Context, MDs);
6670    // Set operand 0 to refer to the loop id itself.
6671    NewLoopID->replaceOperandWith(0, NewLoopID);
6672    L->setLoopID(NewLoopID);
6673  }
6674}
6675
6676bool LoopVectorizationPlanner::getDecisionAndClampRange(
6677    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6678  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6679  bool PredicateAtRangeStart = Predicate(Range.Start);
6680
6681  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6682    if (Predicate(TmpVF) != PredicateAtRangeStart) {
6683      Range.End = TmpVF;
6684      break;
6685    }
6686
6687  return PredicateAtRangeStart;
6688}
6689
6690/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6691/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6692/// of VF's starting at a given VF and extending it as much as possible. Each
6693/// vectorization decision can potentially shorten this sub-range during
6694/// buildVPlan().
6695void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6696  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6697    VFRange SubRange = {VF, MaxVF + 1};
6698    VPlans.push_back(buildVPlan(SubRange));
6699    VF = SubRange.End;
6700  }
6701}
6702
6703VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6704                                         VPlanPtr &Plan) {
6705  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6706
6707  // Look for cached value.
6708  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6709  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6710  if (ECEntryIt != EdgeMaskCache.end())
6711    return ECEntryIt->second;
6712
6713  VPValue *SrcMask = createBlockInMask(Src, Plan);
6714
6715  // The terminator has to be a branch inst!
6716  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6717  assert(BI && "Unexpected terminator found");
6718
6719  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6720    return EdgeMaskCache[Edge] = SrcMask;
6721
6722  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6723  assert(EdgeMask && "No Edge Mask found for condition");
6724
6725  if (BI->getSuccessor(0) != Dst)
6726    EdgeMask = Builder.createNot(EdgeMask);
6727
6728  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6729    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6730
6731  return EdgeMaskCache[Edge] = EdgeMask;
6732}
6733
6734VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6735  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6736
6737  // Look for cached value.
6738  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6739  if (BCEntryIt != BlockMaskCache.end())
6740    return BCEntryIt->second;
6741
6742  // All-one mask is modelled as no-mask following the convention for masked
6743  // load/store/gather/scatter. Initialize BlockMask to no-mask.
6744  VPValue *BlockMask = nullptr;
6745
6746  if (OrigLoop->getHeader() == BB) {
6747    if (!CM.blockNeedsPredication(BB))
6748      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6749
6750    // Introduce the early-exit compare IV <= BTC to form header block mask.
6751    // This is used instead of IV < TC because TC may wrap, unlike BTC.
6752    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6753    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6754    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6755    return BlockMaskCache[BB] = BlockMask;
6756  }
6757
6758  // This is the block mask. We OR all incoming edges.
6759  for (auto *Predecessor : predecessors(BB)) {
6760    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6761    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6762      return BlockMaskCache[BB] = EdgeMask;
6763
6764    if (!BlockMask) { // BlockMask has its initialized nullptr value.
6765      BlockMask = EdgeMask;
6766      continue;
6767    }
6768
6769    BlockMask = Builder.createOr(BlockMask, EdgeMask);
6770  }
6771
6772  return BlockMaskCache[BB] = BlockMask;
6773}
6774
6775VPWidenMemoryInstructionRecipe *
6776VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6777                                  VPlanPtr &Plan) {
6778  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6779    return nullptr;
6780
6781  auto willWiden = [&](unsigned VF) -> bool {
6782    if (VF == 1)
6783      return false;
6784    LoopVectorizationCostModel::InstWidening Decision =
6785        CM.getWideningDecision(I, VF);
6786    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6787           "CM decision should be taken at this point.");
6788    if (Decision == LoopVectorizationCostModel::CM_Interleave)
6789      return true;
6790    if (CM.isScalarAfterVectorization(I, VF) ||
6791        CM.isProfitableToScalarize(I, VF))
6792      return false;
6793    return Decision != LoopVectorizationCostModel::CM_Scalarize;
6794  };
6795
6796  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6797    return nullptr;
6798
6799  VPValue *Mask = nullptr;
6800  if (Legal->isMaskRequired(I))
6801    Mask = createBlockInMask(I->getParent(), Plan);
6802
6803  VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6804  return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6805}
6806
6807VPWidenIntOrFpInductionRecipe *
6808VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6809  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6810    // Check if this is an integer or fp induction. If so, build the recipe that
6811    // produces its scalar and vector values.
6812    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6813    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6814        II.getKind() == InductionDescriptor::IK_FpInduction)
6815      return new VPWidenIntOrFpInductionRecipe(Phi);
6816
6817    return nullptr;
6818  }
6819
6820  // Optimize the special case where the source is a constant integer
6821  // induction variable. Notice that we can only optimize the 'trunc' case
6822  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6823  // (c) other casts depend on pointer size.
6824
6825  // Determine whether \p K is a truncation based on an induction variable that
6826  // can be optimized.
6827  auto isOptimizableIVTruncate =
6828      [&](Instruction *K) -> std::function<bool(unsigned)> {
6829    return
6830        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6831  };
6832
6833  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6834                               isOptimizableIVTruncate(I), Range))
6835    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6836                                             cast<TruncInst>(I));
6837  return nullptr;
6838}
6839
6840VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6841  PHINode *Phi = dyn_cast<PHINode>(I);
6842  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6843    return nullptr;
6844
6845  // We know that all PHIs in non-header blocks are converted into selects, so
6846  // we don't have to worry about the insertion order and we can just use the
6847  // builder. At this point we generate the predication tree. There may be
6848  // duplications since this is a simple recursive scan, but future
6849  // optimizations will clean it up.
6850
6851  SmallVector<VPValue *, 2> Masks;
6852  unsigned NumIncoming = Phi->getNumIncomingValues();
6853  for (unsigned In = 0; In < NumIncoming; In++) {
6854    VPValue *EdgeMask =
6855      createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6856    assert((EdgeMask || NumIncoming == 1) &&
6857           "Multiple predecessors with one having a full mask");
6858    if (EdgeMask)
6859      Masks.push_back(EdgeMask);
6860  }
6861  return new VPBlendRecipe(Phi, Masks);
6862}
6863
6864bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6865                                 VFRange &Range) {
6866
6867  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6868      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6869
6870  if (IsPredicated)
6871    return false;
6872
6873  auto IsVectorizableOpcode = [](unsigned Opcode) {
6874    switch (Opcode) {
6875    case Instruction::Add:
6876    case Instruction::And:
6877    case Instruction::AShr:
6878    case Instruction::BitCast:
6879    case Instruction::Br:
6880    case Instruction::Call:
6881    case Instruction::FAdd:
6882    case Instruction::FCmp:
6883    case Instruction::FDiv:
6884    case Instruction::FMul:
6885    case Instruction::FNeg:
6886    case Instruction::FPExt:
6887    case Instruction::FPToSI:
6888    case Instruction::FPToUI:
6889    case Instruction::FPTrunc:
6890    case Instruction::FRem:
6891    case Instruction::FSub:
6892    case Instruction::ICmp:
6893    case Instruction::IntToPtr:
6894    case Instruction::Load:
6895    case Instruction::LShr:
6896    case Instruction::Mul:
6897    case Instruction::Or:
6898    case Instruction::PHI:
6899    case Instruction::PtrToInt:
6900    case Instruction::SDiv:
6901    case Instruction::Select:
6902    case Instruction::SExt:
6903    case Instruction::Shl:
6904    case Instruction::SIToFP:
6905    case Instruction::SRem:
6906    case Instruction::Store:
6907    case Instruction::Sub:
6908    case Instruction::Trunc:
6909    case Instruction::UDiv:
6910    case Instruction::UIToFP:
6911    case Instruction::URem:
6912    case Instruction::Xor:
6913    case Instruction::ZExt:
6914      return true;
6915    }
6916    return false;
6917  };
6918
6919  if (!IsVectorizableOpcode(I->getOpcode()))
6920    return false;
6921
6922  if (CallInst *CI = dyn_cast<CallInst>(I)) {
6923    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6924    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6925               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6926      return false;
6927  }
6928
6929  auto willWiden = [&](unsigned VF) -> bool {
6930    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6931                             CM.isProfitableToScalarize(I, VF)))
6932      return false;
6933    if (CallInst *CI = dyn_cast<CallInst>(I)) {
6934      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6935      // The following case may be scalarized depending on the VF.
6936      // The flag shows whether we use Intrinsic or a usual Call for vectorized
6937      // version of the instruction.
6938      // Is it beneficial to perform intrinsic call compared to lib call?
6939      bool NeedToScalarize;
6940      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6941      bool UseVectorIntrinsic =
6942          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6943      return UseVectorIntrinsic || !NeedToScalarize;
6944    }
6945    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6946      assert(CM.getWideningDecision(I, VF) ==
6947                 LoopVectorizationCostModel::CM_Scalarize &&
6948             "Memory widening decisions should have been taken care by now");
6949      return false;
6950    }
6951    return true;
6952  };
6953
6954  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6955    return false;
6956  // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6957  // to avoid having to split recipes later.
6958  bool IsSingleton = Ingredient2Recipe.count(I);
6959
6960  // Success: widen this instruction.
6961
6962  // Use the default widening recipe. We optimize the common case where
6963  // consecutive instructions can be represented by a single recipe.
6964  if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6965      LastExtensibleRecipe->appendInstruction(I))
6966    return true;
6967
6968  VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6969  if (!IsSingleton)
6970    LastExtensibleRecipe = WidenRecipe;
6971  setRecipe(I, WidenRecipe);
6972  VPBB->appendRecipe(WidenRecipe);
6973  return true;
6974}
6975
6976VPBasicBlock *VPRecipeBuilder::handleReplication(
6977    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6978    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6979    VPlanPtr &Plan) {
6980  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6981      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6982      Range);
6983
6984  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6985      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6986
6987  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6988  setRecipe(I, Recipe);
6989
6990  // Find if I uses a predicated instruction. If so, it will use its scalar
6991  // value. Avoid hoisting the insert-element which packs the scalar value into
6992  // a vector value, as that happens iff all users use the vector value.
6993  for (auto &Op : I->operands())
6994    if (auto *PredInst = dyn_cast<Instruction>(Op))
6995      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6996        PredInst2Recipe[PredInst]->setAlsoPack(false);
6997
6998  // Finalize the recipe for Instr, first if it is not predicated.
6999  if (!IsPredicated) {
7000    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7001    VPBB->appendRecipe(Recipe);
7002    return VPBB;
7003  }
7004  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7005  assert(VPBB->getSuccessors().empty() &&
7006         "VPBB has successors when handling predicated replication.");
7007  // Record predicated instructions for above packing optimizations.
7008  PredInst2Recipe[I] = Recipe;
7009  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7010  VPBlockUtils::insertBlockAfter(Region, VPBB);
7011  auto *RegSucc = new VPBasicBlock();
7012  VPBlockUtils::insertBlockAfter(RegSucc, Region);
7013  return RegSucc;
7014}
7015
7016VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7017                                                      VPRecipeBase *PredRecipe,
7018                                                      VPlanPtr &Plan) {
7019  // Instructions marked for predication are replicated and placed under an
7020  // if-then construct to prevent side-effects.
7021
7022  // Generate recipes to compute the block mask for this region.
7023  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7024
7025  // Build the triangular if-then region.
7026  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7027  assert(Instr->getParent() && "Predicated instruction not in any basic block");
7028  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7029  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7030  auto *PHIRecipe =
7031      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7032  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7033  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7034  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7035
7036  // Note: first set Entry as region entry and then connect successors starting
7037  // from it in order, to propagate the "parent" of each VPBasicBlock.
7038  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7039  VPBlockUtils::connectBlocks(Pred, Exit);
7040
7041  return Region;
7042}
7043
7044bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7045                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
7046  VPRecipeBase *Recipe = nullptr;
7047
7048  // First, check for specific widening recipes that deal with memory
7049  // operations, inductions and Phi nodes.
7050  if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7051      (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7052      (Recipe = tryToBlend(Instr, Plan)) ||
7053      (isa<PHINode>(Instr) &&
7054       (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7055    setRecipe(Instr, Recipe);
7056    VPBB->appendRecipe(Recipe);
7057    return true;
7058  }
7059
7060  // Handle GEP widening.
7061  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7062    auto Scalarize = [&](unsigned VF) {
7063      return CM.isScalarWithPredication(Instr, VF) ||
7064             CM.isScalarAfterVectorization(Instr, VF) ||
7065             CM.isProfitableToScalarize(Instr, VF);
7066    };
7067    if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7068      return false;
7069    VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7070    setRecipe(Instr, Recipe);
7071    VPBB->appendRecipe(Recipe);
7072    return true;
7073  }
7074
7075  // Check if Instr is to be widened by a general VPWidenRecipe, after
7076  // having first checked for specific widening recipes.
7077  if (tryToWiden(Instr, VPBB, Range))
7078    return true;
7079
7080  return false;
7081}
7082
7083void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7084                                                        unsigned MaxVF) {
7085  assert(OrigLoop->empty() && "Inner loop expected.");
7086
7087  // Collect conditions feeding internal conditional branches; they need to be
7088  // represented in VPlan for it to model masking.
7089  SmallPtrSet<Value *, 1> NeedDef;
7090
7091  auto *Latch = OrigLoop->getLoopLatch();
7092  for (BasicBlock *BB : OrigLoop->blocks()) {
7093    if (BB == Latch)
7094      continue;
7095    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7096    if (Branch && Branch->isConditional())
7097      NeedDef.insert(Branch->getCondition());
7098  }
7099
7100  // If the tail is to be folded by masking, the primary induction variable
7101  // needs to be represented in VPlan for it to model early-exit masking.
7102  // Also, both the Phi and the live-out instruction of each reduction are
7103  // required in order to introduce a select between them in VPlan.
7104  if (CM.foldTailByMasking()) {
7105    NeedDef.insert(Legal->getPrimaryInduction());
7106    for (auto &Reduction : *Legal->getReductionVars()) {
7107      NeedDef.insert(Reduction.first);
7108      NeedDef.insert(Reduction.second.getLoopExitInstr());
7109    }
7110  }
7111
7112  // Collect instructions from the original loop that will become trivially dead
7113  // in the vectorized loop. We don't need to vectorize these instructions. For
7114  // example, original induction update instructions can become dead because we
7115  // separately emit induction "steps" when generating code for the new loop.
7116  // Similarly, we create a new latch condition when setting up the structure
7117  // of the new loop, so the old one can become dead.
7118  SmallPtrSet<Instruction *, 4> DeadInstructions;
7119  collectTriviallyDeadInstructions(DeadInstructions);
7120
7121  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7122  // Dead instructions do not need sinking. Remove them from SinkAfter.
7123  for (Instruction *I : DeadInstructions)
7124    SinkAfter.erase(I);
7125
7126  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7127    VFRange SubRange = {VF, MaxVF + 1};
7128    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7129                                             DeadInstructions, SinkAfter));
7130    VF = SubRange.End;
7131  }
7132}
7133
7134VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7135    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7136    SmallPtrSetImpl<Instruction *> &DeadInstructions,
7137    const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7138
7139  // Hold a mapping from predicated instructions to their recipes, in order to
7140  // fix their AlsoPack behavior if a user is determined to replicate and use a
7141  // scalar instead of vector value.
7142  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7143
7144  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7145
7146  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7147
7148  // ---------------------------------------------------------------------------
7149  // Pre-construction: record ingredients whose recipes we'll need to further
7150  // process after constructing the initial VPlan.
7151  // ---------------------------------------------------------------------------
7152
7153  // Mark instructions we'll need to sink later and their targets as
7154  // ingredients whose recipe we'll need to record.
7155  for (auto &Entry : SinkAfter) {
7156    RecipeBuilder.recordRecipeOf(Entry.first);
7157    RecipeBuilder.recordRecipeOf(Entry.second);
7158  }
7159
7160  // For each interleave group which is relevant for this (possibly trimmed)
7161  // Range, add it to the set of groups to be later applied to the VPlan and add
7162  // placeholders for its members' Recipes which we'll be replacing with a
7163  // single VPInterleaveRecipe.
7164  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7165    auto applyIG = [IG, this](unsigned VF) -> bool {
7166      return (VF >= 2 && // Query is illegal for VF == 1
7167              CM.getWideningDecision(IG->getInsertPos(), VF) ==
7168                  LoopVectorizationCostModel::CM_Interleave);
7169    };
7170    if (!getDecisionAndClampRange(applyIG, Range))
7171      continue;
7172    InterleaveGroups.insert(IG);
7173    for (unsigned i = 0; i < IG->getFactor(); i++)
7174      if (Instruction *Member = IG->getMember(i))
7175        RecipeBuilder.recordRecipeOf(Member);
7176  };
7177
7178  // ---------------------------------------------------------------------------
7179  // Build initial VPlan: Scan the body of the loop in a topological order to
7180  // visit each basic block after having visited its predecessor basic blocks.
7181  // ---------------------------------------------------------------------------
7182
7183  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7184  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7185  auto Plan = std::make_unique<VPlan>(VPBB);
7186
7187  // Represent values that will have defs inside VPlan.
7188  for (Value *V : NeedDef)
7189    Plan->addVPValue(V);
7190
7191  // Scan the body of the loop in a topological order to visit each basic block
7192  // after having visited its predecessor basic blocks.
7193  LoopBlocksDFS DFS(OrigLoop);
7194  DFS.perform(LI);
7195
7196  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7197    // Relevant instructions from basic block BB will be grouped into VPRecipe
7198    // ingredients and fill a new VPBasicBlock.
7199    unsigned VPBBsForBB = 0;
7200    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7201    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7202    VPBB = FirstVPBBForBB;
7203    Builder.setInsertPoint(VPBB);
7204
7205    // Introduce each ingredient into VPlan.
7206    for (Instruction &I : BB->instructionsWithoutDebug()) {
7207      Instruction *Instr = &I;
7208
7209      // First filter out irrelevant instructions, to ensure no recipes are
7210      // built for them.
7211      if (isa<BranchInst>(Instr) ||
7212          DeadInstructions.find(Instr) != DeadInstructions.end())
7213        continue;
7214
7215      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7216        continue;
7217
7218      // Otherwise, if all widening options failed, Instruction is to be
7219      // replicated. This may create a successor for VPBB.
7220      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7221          Instr, Range, VPBB, PredInst2Recipe, Plan);
7222      if (NextVPBB != VPBB) {
7223        VPBB = NextVPBB;
7224        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7225                                    : "");
7226      }
7227    }
7228  }
7229
7230  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7231  // may also be empty, such as the last one VPBB, reflecting original
7232  // basic-blocks with no recipes.
7233  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7234  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7235  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7236  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7237  delete PreEntry;
7238
7239  // ---------------------------------------------------------------------------
7240  // Transform initial VPlan: Apply previously taken decisions, in order, to
7241  // bring the VPlan to its final state.
7242  // ---------------------------------------------------------------------------
7243
7244  // Apply Sink-After legal constraints.
7245  for (auto &Entry : SinkAfter) {
7246    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7247    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7248    Sink->moveAfter(Target);
7249  }
7250
7251  // Interleave memory: for each Interleave Group we marked earlier as relevant
7252  // for this VPlan, replace the Recipes widening its memory instructions with a
7253  // single VPInterleaveRecipe at its insertion point.
7254  for (auto IG : InterleaveGroups) {
7255    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7256        RecipeBuilder.getRecipe(IG->getInsertPos()));
7257    (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7258        ->insertBefore(Recipe);
7259
7260    for (unsigned i = 0; i < IG->getFactor(); ++i)
7261      if (Instruction *Member = IG->getMember(i)) {
7262        RecipeBuilder.getRecipe(Member)->eraseFromParent();
7263      }
7264  }
7265
7266  // Finally, if tail is folded by masking, introduce selects between the phi
7267  // and the live-out instruction of each reduction, at the end of the latch.
7268  if (CM.foldTailByMasking()) {
7269    Builder.setInsertPoint(VPBB);
7270    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7271    for (auto &Reduction : *Legal->getReductionVars()) {
7272      VPValue *Phi = Plan->getVPValue(Reduction.first);
7273      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7274      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7275    }
7276  }
7277
7278  std::string PlanName;
7279  raw_string_ostream RSO(PlanName);
7280  unsigned VF = Range.Start;
7281  Plan->addVF(VF);
7282  RSO << "Initial VPlan for VF={" << VF;
7283  for (VF *= 2; VF < Range.End; VF *= 2) {
7284    Plan->addVF(VF);
7285    RSO << "," << VF;
7286  }
7287  RSO << "},UF>=1";
7288  RSO.flush();
7289  Plan->setName(PlanName);
7290
7291  return Plan;
7292}
7293
7294VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7295  // Outer loop handling: They may require CFG and instruction level
7296  // transformations before even evaluating whether vectorization is profitable.
7297  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7298  // the vectorization pipeline.
7299  assert(!OrigLoop->empty());
7300  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7301
7302  // Create new empty VPlan
7303  auto Plan = std::make_unique<VPlan>();
7304
7305  // Build hierarchical CFG
7306  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7307  HCFGBuilder.buildHierarchicalCFG();
7308
7309  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7310    Plan->addVF(VF);
7311
7312  if (EnableVPlanPredication) {
7313    VPlanPredicator VPP(*Plan);
7314    VPP.predicate();
7315
7316    // Avoid running transformation to recipes until masked code generation in
7317    // VPlan-native path is in place.
7318    return Plan;
7319  }
7320
7321  SmallPtrSet<Instruction *, 1> DeadInstructions;
7322  VPlanTransforms::VPInstructionsToVPRecipes(
7323      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7324  return Plan;
7325}
7326
7327Value* LoopVectorizationPlanner::VPCallbackILV::
7328getOrCreateVectorValues(Value *V, unsigned Part) {
7329      return ILV.getOrCreateVectorValue(V, Part);
7330}
7331
7332Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7333    Value *V, const VPIteration &Instance) {
7334  return ILV.getOrCreateScalarValue(V, Instance);
7335}
7336
7337void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7338  O << " +\n"
7339    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7340  IG->getInsertPos()->printAsOperand(O, false);
7341  O << ", ";
7342  getAddr()->printAsOperand(O);
7343  VPValue *Mask = getMask();
7344  if (Mask) {
7345    O << ", ";
7346    Mask->printAsOperand(O);
7347  }
7348  O << "\\l\"";
7349  for (unsigned i = 0; i < IG->getFactor(); ++i)
7350    if (Instruction *I = IG->getMember(i))
7351      O << " +\n"
7352        << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7353}
7354
7355void VPWidenRecipe::execute(VPTransformState &State) {
7356  for (auto &Instr : make_range(Begin, End))
7357    State.ILV->widenInstruction(Instr);
7358}
7359
7360void VPWidenGEPRecipe::execute(VPTransformState &State) {
7361  State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7362                      IsIndexLoopInvariant);
7363}
7364
7365void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7366  assert(!State.Instance && "Int or FP induction being replicated.");
7367  State.ILV->widenIntOrFpInduction(IV, Trunc);
7368}
7369
7370void VPWidenPHIRecipe::execute(VPTransformState &State) {
7371  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7372}
7373
7374void VPBlendRecipe::execute(VPTransformState &State) {
7375  State.ILV->setDebugLocFromInst(State.Builder, Phi);
7376  // We know that all PHIs in non-header blocks are converted into
7377  // selects, so we don't have to worry about the insertion order and we
7378  // can just use the builder.
7379  // At this point we generate the predication tree. There may be
7380  // duplications since this is a simple recursive scan, but future
7381  // optimizations will clean it up.
7382
7383  unsigned NumIncoming = Phi->getNumIncomingValues();
7384
7385  assert((User || NumIncoming == 1) &&
7386         "Multiple predecessors with predecessors having a full mask");
7387  // Generate a sequence of selects of the form:
7388  // SELECT(Mask3, In3,
7389  //      SELECT(Mask2, In2,
7390  //                   ( ...)))
7391  InnerLoopVectorizer::VectorParts Entry(State.UF);
7392  for (unsigned In = 0; In < NumIncoming; ++In) {
7393    for (unsigned Part = 0; Part < State.UF; ++Part) {
7394      // We might have single edge PHIs (blocks) - use an identity
7395      // 'select' for the first PHI operand.
7396      Value *In0 =
7397          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7398      if (In == 0)
7399        Entry[Part] = In0; // Initialize with the first incoming value.
7400      else {
7401        // Select between the current value and the previous incoming edge
7402        // based on the incoming mask.
7403        Value *Cond = State.get(User->getOperand(In), Part);
7404        Entry[Part] =
7405            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7406      }
7407    }
7408  }
7409  for (unsigned Part = 0; Part < State.UF; ++Part)
7410    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7411}
7412
7413void VPInterleaveRecipe::execute(VPTransformState &State) {
7414  assert(!State.Instance && "Interleave group being replicated.");
7415  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7416                                      getMask());
7417}
7418
7419void VPReplicateRecipe::execute(VPTransformState &State) {
7420  if (State.Instance) { // Generate a single instance.
7421    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7422    // Insert scalar instance packing it into a vector.
7423    if (AlsoPack && State.VF > 1) {
7424      // If we're constructing lane 0, initialize to start from undef.
7425      if (State.Instance->Lane == 0) {
7426        Value *Undef =
7427            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7428        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7429      }
7430      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7431    }
7432    return;
7433  }
7434
7435  // Generate scalar instances for all VF lanes of all UF parts, unless the
7436  // instruction is uniform inwhich case generate only the first lane for each
7437  // of the UF parts.
7438  unsigned EndLane = IsUniform ? 1 : State.VF;
7439  for (unsigned Part = 0; Part < State.UF; ++Part)
7440    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7441      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7442}
7443
7444void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7445  assert(State.Instance && "Branch on Mask works only on single instance.");
7446
7447  unsigned Part = State.Instance->Part;
7448  unsigned Lane = State.Instance->Lane;
7449
7450  Value *ConditionBit = nullptr;
7451  if (!User) // Block in mask is all-one.
7452    ConditionBit = State.Builder.getTrue();
7453  else {
7454    VPValue *BlockInMask = User->getOperand(0);
7455    ConditionBit = State.get(BlockInMask, Part);
7456    if (ConditionBit->getType()->isVectorTy())
7457      ConditionBit = State.Builder.CreateExtractElement(
7458          ConditionBit, State.Builder.getInt32(Lane));
7459  }
7460
7461  // Replace the temporary unreachable terminator with a new conditional branch,
7462  // whose two destinations will be set later when they are created.
7463  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7464  assert(isa<UnreachableInst>(CurrentTerminator) &&
7465         "Expected to replace unreachable terminator with conditional branch.");
7466  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7467  CondBr->setSuccessor(0, nullptr);
7468  ReplaceInstWithInst(CurrentTerminator, CondBr);
7469}
7470
7471void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7472  assert(State.Instance && "Predicated instruction PHI works per instance.");
7473  Instruction *ScalarPredInst = cast<Instruction>(
7474      State.ValueMap.getScalarValue(PredInst, *State.Instance));
7475  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7476  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7477  assert(PredicatingBB && "Predicated block has no single predecessor.");
7478
7479  // By current pack/unpack logic we need to generate only a single phi node: if
7480  // a vector value for the predicated instruction exists at this point it means
7481  // the instruction has vector users only, and a phi for the vector value is
7482  // needed. In this case the recipe of the predicated instruction is marked to
7483  // also do that packing, thereby "hoisting" the insert-element sequence.
7484  // Otherwise, a phi node for the scalar value is needed.
7485  unsigned Part = State.Instance->Part;
7486  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7487    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7488    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7489    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7490    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7491    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7492    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7493  } else {
7494    Type *PredInstType = PredInst->getType();
7495    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7496    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7497    Phi->addIncoming(ScalarPredInst, PredicatedBB);
7498    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7499  }
7500}
7501
7502void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7503  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7504}
7505
7506// Determine how to lower the scalar epilogue, which depends on 1) optimising
7507// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7508// predication, and 4) a TTI hook that analyses whether the loop is suitable
7509// for predication.
7510static ScalarEpilogueLowering getScalarEpilogueLowering(
7511    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7512    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7513    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7514    LoopVectorizationLegality &LVL) {
7515  bool OptSize =
7516      F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7517                                                     PGSOQueryType::IRPass);
7518  // 1) OptSize takes precedence over all other options, i.e. if this is set,
7519  // don't look at hints or options, and don't request a scalar epilogue.
7520  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7521    return CM_ScalarEpilogueNotAllowedOptSize;
7522
7523  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7524                              !PreferPredicateOverEpilog;
7525
7526  // 2) Next, if disabling predication is requested on the command line, honour
7527  // this and request a scalar epilogue. Also do this if we don't have a
7528  // primary induction variable, which is required for predication.
7529  if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7530    return CM_ScalarEpilogueAllowed;
7531
7532  // 3) and 4) look if enabling predication is requested on the command line,
7533  // with a loop hint, or if the TTI hook indicates this is profitable, request
7534  // predication .
7535  if (PreferPredicateOverEpilog ||
7536      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7537      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7538                                        LVL.getLAI()) &&
7539       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7540    return CM_ScalarEpilogueNotNeededUsePredicate;
7541
7542  return CM_ScalarEpilogueAllowed;
7543}
7544
7545// Process the loop in the VPlan-native vectorization path. This path builds
7546// VPlan upfront in the vectorization pipeline, which allows to apply
7547// VPlan-to-VPlan transformations from the very beginning without modifying the
7548// input LLVM IR.
7549static bool processLoopInVPlanNativePath(
7550    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7551    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7552    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7553    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7554    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7555
7556  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7557  Function *F = L->getHeader()->getParent();
7558  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7559
7560  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7561      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7562
7563  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7564                                &Hints, IAI);
7565  // Use the planner for outer loop vectorization.
7566  // TODO: CM is not used at this point inside the planner. Turn CM into an
7567  // optional argument if we don't need it in the future.
7568  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7569
7570  // Get user vectorization factor.
7571  const unsigned UserVF = Hints.getWidth();
7572
7573  // Plan how to best vectorize, return the best VF and its cost.
7574  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7575
7576  // If we are stress testing VPlan builds, do not attempt to generate vector
7577  // code. Masked vector code generation support will follow soon.
7578  // Also, do not attempt to vectorize if no vector code will be produced.
7579  if (VPlanBuildStressTest || EnableVPlanPredication ||
7580      VectorizationFactor::Disabled() == VF)
7581    return false;
7582
7583  LVP.setBestPlan(VF.Width, 1);
7584
7585  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7586                         &CM);
7587  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7588                    << L->getHeader()->getParent()->getName() << "\"\n");
7589  LVP.executePlan(LB, DT);
7590
7591  // Mark the loop as already vectorized to avoid vectorizing again.
7592  Hints.setAlreadyVectorized();
7593
7594  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7595  return true;
7596}
7597
7598bool LoopVectorizePass::processLoop(Loop *L) {
7599  assert((EnableVPlanNativePath || L->empty()) &&
7600         "VPlan-native path is not enabled. Only process inner loops.");
7601
7602#ifndef NDEBUG
7603  const std::string DebugLocStr = getDebugLocString(L);
7604#endif /* NDEBUG */
7605
7606  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7607                    << L->getHeader()->getParent()->getName() << "\" from "
7608                    << DebugLocStr << "\n");
7609
7610  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7611
7612  LLVM_DEBUG(
7613      dbgs() << "LV: Loop hints:"
7614             << " force="
7615             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7616                     ? "disabled"
7617                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7618                            ? "enabled"
7619                            : "?"))
7620             << " width=" << Hints.getWidth()
7621             << " unroll=" << Hints.getInterleave() << "\n");
7622
7623  // Function containing loop
7624  Function *F = L->getHeader()->getParent();
7625
7626  // Looking at the diagnostic output is the only way to determine if a loop
7627  // was vectorized (other than looking at the IR or machine code), so it
7628  // is important to generate an optimization remark for each loop. Most of
7629  // these messages are generated as OptimizationRemarkAnalysis. Remarks
7630  // generated as OptimizationRemark and OptimizationRemarkMissed are
7631  // less verbose reporting vectorized loops and unvectorized loops that may
7632  // benefit from vectorization, respectively.
7633
7634  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7635    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7636    return false;
7637  }
7638
7639  PredicatedScalarEvolution PSE(*SE, *L);
7640
7641  // Check if it is legal to vectorize the loop.
7642  LoopVectorizationRequirements Requirements(*ORE);
7643  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7644                                &Requirements, &Hints, DB, AC);
7645  if (!LVL.canVectorize(EnableVPlanNativePath)) {
7646    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7647    Hints.emitRemarkWithHints();
7648    return false;
7649  }
7650
7651  // Check the function attributes and profiles to find out if this function
7652  // should be optimized for size.
7653  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7654      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7655
7656  // Entrance to the VPlan-native vectorization path. Outer loops are processed
7657  // here. They may require CFG and instruction level transformations before
7658  // even evaluating whether vectorization is profitable. Since we cannot modify
7659  // the incoming IR, we need to build VPlan upfront in the vectorization
7660  // pipeline.
7661  if (!L->empty())
7662    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7663                                        ORE, BFI, PSI, Hints);
7664
7665  assert(L->empty() && "Inner loop expected.");
7666
7667  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7668  // count by optimizing for size, to minimize overheads.
7669  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7670  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7671    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7672                      << "This loop is worth vectorizing only if no scalar "
7673                      << "iteration overheads are incurred.");
7674    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7675      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7676    else {
7677      LLVM_DEBUG(dbgs() << "\n");
7678      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7679    }
7680  }
7681
7682  // Check the function attributes to see if implicit floats are allowed.
7683  // FIXME: This check doesn't seem possibly correct -- what if the loop is
7684  // an integer loop and the vector instructions selected are purely integer
7685  // vector instructions?
7686  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7687    reportVectorizationFailure(
7688        "Can't vectorize when the NoImplicitFloat attribute is used",
7689        "loop not vectorized due to NoImplicitFloat attribute",
7690        "NoImplicitFloat", ORE, L);
7691    Hints.emitRemarkWithHints();
7692    return false;
7693  }
7694
7695  // Check if the target supports potentially unsafe FP vectorization.
7696  // FIXME: Add a check for the type of safety issue (denormal, signaling)
7697  // for the target we're vectorizing for, to make sure none of the
7698  // additional fp-math flags can help.
7699  if (Hints.isPotentiallyUnsafe() &&
7700      TTI->isFPVectorizationPotentiallyUnsafe()) {
7701    reportVectorizationFailure(
7702        "Potentially unsafe FP op prevents vectorization",
7703        "loop not vectorized due to unsafe FP support.",
7704        "UnsafeFP", ORE, L);
7705    Hints.emitRemarkWithHints();
7706    return false;
7707  }
7708
7709  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7710  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7711
7712  // If an override option has been passed in for interleaved accesses, use it.
7713  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7714    UseInterleaved = EnableInterleavedMemAccesses;
7715
7716  // Analyze interleaved memory accesses.
7717  if (UseInterleaved) {
7718    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7719  }
7720
7721  // Use the cost model.
7722  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7723                                F, &Hints, IAI);
7724  CM.collectValuesToIgnore();
7725
7726  // Use the planner for vectorization.
7727  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7728
7729  // Get user vectorization factor.
7730  unsigned UserVF = Hints.getWidth();
7731
7732  // Plan how to best vectorize, return the best VF and its cost.
7733  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7734
7735  VectorizationFactor VF = VectorizationFactor::Disabled();
7736  unsigned IC = 1;
7737  unsigned UserIC = Hints.getInterleave();
7738
7739  if (MaybeVF) {
7740    VF = *MaybeVF;
7741    // Select the interleave count.
7742    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7743  }
7744
7745  // Identify the diagnostic messages that should be produced.
7746  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7747  bool VectorizeLoop = true, InterleaveLoop = true;
7748  if (Requirements.doesNotMeet(F, L, Hints)) {
7749    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7750                         "requirements.\n");
7751    Hints.emitRemarkWithHints();
7752    return false;
7753  }
7754
7755  if (VF.Width == 1) {
7756    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7757    VecDiagMsg = std::make_pair(
7758        "VectorizationNotBeneficial",
7759        "the cost-model indicates that vectorization is not beneficial");
7760    VectorizeLoop = false;
7761  }
7762
7763  if (!MaybeVF && UserIC > 1) {
7764    // Tell the user interleaving was avoided up-front, despite being explicitly
7765    // requested.
7766    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7767                         "interleaving should be avoided up front\n");
7768    IntDiagMsg = std::make_pair(
7769        "InterleavingAvoided",
7770        "Ignoring UserIC, because interleaving was avoided up front");
7771    InterleaveLoop = false;
7772  } else if (IC == 1 && UserIC <= 1) {
7773    // Tell the user interleaving is not beneficial.
7774    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7775    IntDiagMsg = std::make_pair(
7776        "InterleavingNotBeneficial",
7777        "the cost-model indicates that interleaving is not beneficial");
7778    InterleaveLoop = false;
7779    if (UserIC == 1) {
7780      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7781      IntDiagMsg.second +=
7782          " and is explicitly disabled or interleave count is set to 1";
7783    }
7784  } else if (IC > 1 && UserIC == 1) {
7785    // Tell the user interleaving is beneficial, but it explicitly disabled.
7786    LLVM_DEBUG(
7787        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7788    IntDiagMsg = std::make_pair(
7789        "InterleavingBeneficialButDisabled",
7790        "the cost-model indicates that interleaving is beneficial "
7791        "but is explicitly disabled or interleave count is set to 1");
7792    InterleaveLoop = false;
7793  }
7794
7795  // Override IC if user provided an interleave count.
7796  IC = UserIC > 0 ? UserIC : IC;
7797
7798  // Emit diagnostic messages, if any.
7799  const char *VAPassName = Hints.vectorizeAnalysisPassName();
7800  if (!VectorizeLoop && !InterleaveLoop) {
7801    // Do not vectorize or interleaving the loop.
7802    ORE->emit([&]() {
7803      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7804                                      L->getStartLoc(), L->getHeader())
7805             << VecDiagMsg.second;
7806    });
7807    ORE->emit([&]() {
7808      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7809                                      L->getStartLoc(), L->getHeader())
7810             << IntDiagMsg.second;
7811    });
7812    return false;
7813  } else if (!VectorizeLoop && InterleaveLoop) {
7814    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7815    ORE->emit([&]() {
7816      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7817                                        L->getStartLoc(), L->getHeader())
7818             << VecDiagMsg.second;
7819    });
7820  } else if (VectorizeLoop && !InterleaveLoop) {
7821    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7822                      << ") in " << DebugLocStr << '\n');
7823    ORE->emit([&]() {
7824      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7825                                        L->getStartLoc(), L->getHeader())
7826             << IntDiagMsg.second;
7827    });
7828  } else if (VectorizeLoop && InterleaveLoop) {
7829    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7830                      << ") in " << DebugLocStr << '\n');
7831    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7832  }
7833
7834  LVP.setBestPlan(VF.Width, IC);
7835
7836  using namespace ore;
7837  bool DisableRuntimeUnroll = false;
7838  MDNode *OrigLoopID = L->getLoopID();
7839
7840  if (!VectorizeLoop) {
7841    assert(IC > 1 && "interleave count should not be 1 or 0");
7842    // If we decided that it is not legal to vectorize the loop, then
7843    // interleave it.
7844    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7845                               &CM);
7846    LVP.executePlan(Unroller, DT);
7847
7848    ORE->emit([&]() {
7849      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7850                                L->getHeader())
7851             << "interleaved loop (interleaved count: "
7852             << NV("InterleaveCount", IC) << ")";
7853    });
7854  } else {
7855    // If we decided that it is *legal* to vectorize the loop, then do it.
7856    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7857                           &LVL, &CM);
7858    LVP.executePlan(LB, DT);
7859    ++LoopsVectorized;
7860
7861    // Add metadata to disable runtime unrolling a scalar loop when there are
7862    // no runtime checks about strides and memory. A scalar loop that is
7863    // rarely used is not worth unrolling.
7864    if (!LB.areSafetyChecksAdded())
7865      DisableRuntimeUnroll = true;
7866
7867    // Report the vectorization decision.
7868    ORE->emit([&]() {
7869      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7870                                L->getHeader())
7871             << "vectorized loop (vectorization width: "
7872             << NV("VectorizationFactor", VF.Width)
7873             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7874    });
7875  }
7876
7877  Optional<MDNode *> RemainderLoopID =
7878      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7879                                      LLVMLoopVectorizeFollowupEpilogue});
7880  if (RemainderLoopID.hasValue()) {
7881    L->setLoopID(RemainderLoopID.getValue());
7882  } else {
7883    if (DisableRuntimeUnroll)
7884      AddRuntimeUnrollDisableMetaData(L);
7885
7886    // Mark the loop as already vectorized to avoid vectorizing again.
7887    Hints.setAlreadyVectorized();
7888  }
7889
7890  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7891  return true;
7892}
7893
7894bool LoopVectorizePass::runImpl(
7895    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7896    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7897    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7898    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7899    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7900  SE = &SE_;
7901  LI = &LI_;
7902  TTI = &TTI_;
7903  DT = &DT_;
7904  BFI = &BFI_;
7905  TLI = TLI_;
7906  AA = &AA_;
7907  AC = &AC_;
7908  GetLAA = &GetLAA_;
7909  DB = &DB_;
7910  ORE = &ORE_;
7911  PSI = PSI_;
7912
7913  // Don't attempt if
7914  // 1. the target claims to have no vector registers, and
7915  // 2. interleaving won't help ILP.
7916  //
7917  // The second condition is necessary because, even if the target has no
7918  // vector registers, loop vectorization may still enable scalar
7919  // interleaving.
7920  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7921      TTI->getMaxInterleaveFactor(1) < 2)
7922    return false;
7923
7924  bool Changed = false;
7925
7926  // The vectorizer requires loops to be in simplified form.
7927  // Since simplification may add new inner loops, it has to run before the
7928  // legality and profitability checks. This means running the loop vectorizer
7929  // will simplify all loops, regardless of whether anything end up being
7930  // vectorized.
7931  for (auto &L : *LI)
7932    Changed |=
7933        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7934
7935  // Build up a worklist of inner-loops to vectorize. This is necessary as
7936  // the act of vectorizing or partially unrolling a loop creates new loops
7937  // and can invalidate iterators across the loops.
7938  SmallVector<Loop *, 8> Worklist;
7939
7940  for (Loop *L : *LI)
7941    collectSupportedLoops(*L, LI, ORE, Worklist);
7942
7943  LoopsAnalyzed += Worklist.size();
7944
7945  // Now walk the identified inner loops.
7946  while (!Worklist.empty()) {
7947    Loop *L = Worklist.pop_back_val();
7948
7949    // For the inner loops we actually process, form LCSSA to simplify the
7950    // transform.
7951    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7952
7953    Changed |= processLoop(L);
7954  }
7955
7956  // Process each loop nest in the function.
7957  return Changed;
7958}
7959
7960PreservedAnalyses LoopVectorizePass::run(Function &F,
7961                                         FunctionAnalysisManager &AM) {
7962    auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7963    auto &LI = AM.getResult<LoopAnalysis>(F);
7964    auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7965    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7966    auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7967    auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7968    auto &AA = AM.getResult<AAManager>(F);
7969    auto &AC = AM.getResult<AssumptionAnalysis>(F);
7970    auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7971    auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7972    MemorySSA *MSSA = EnableMSSALoopDependency
7973                          ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7974                          : nullptr;
7975
7976    auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7977    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7978        [&](Loop &L) -> const LoopAccessInfo & {
7979      LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7980      return LAM.getResult<LoopAccessAnalysis>(L, AR);
7981    };
7982    const ModuleAnalysisManager &MAM =
7983        AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7984    ProfileSummaryInfo *PSI =
7985        MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7986    bool Changed =
7987        runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7988    if (!Changed)
7989      return PreservedAnalyses::all();
7990    PreservedAnalyses PA;
7991
7992    // We currently do not preserve loopinfo/dominator analyses with outer loop
7993    // vectorization. Until this is addressed, mark these analyses as preserved
7994    // only for non-VPlan-native path.
7995    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7996    if (!EnableVPlanNativePath) {
7997      PA.preserve<LoopAnalysis>();
7998      PA.preserve<DominatorTreeAnalysis>();
7999    }
8000    PA.preserve<BasicAA>();
8001    PA.preserve<GlobalsAA>();
8002    return PA;
8003}
8004